In [1]:
import quandl
import pandas as pd
import numpy as np
import datetime as dt
import pandas_profiling
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
In [2]:
# import libraries here; add more as necessary
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import preprocessing

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.model_selection import train_test_split
from collections import defaultdict
# Import supplementary visualization code visuals.py
#import visuals as vs
from numpy import concatenate

# magic word for producing visualizations in notebook.allow plots to appear directly in the notebook
%matplotlib inline
In [3]:
#LSTM

# magic word for producing visualizations in notebook.allow plots to appear directly in the notebook
%matplotlib inline
from subprocess import check_output
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers import LSTM, CuDNNLSTM , BatchNormalization
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
import time
from numpy import newaxis

#fbProphet libraries
from fbprophet import Prophet
# plt.style.available
plt.style.use("seaborn-whitegrid")
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import statsmodels.api as sm
from scipy import stats
Using TensorFlow backend.
In [4]:
quandl.ApiConfig.api_key = "v5Bazu_S389s29HiutZh"
In [5]:
#Get data from Quandl APIs into dataframes
SP500_DIV_YIELD_MONTH = quandl.get('MULTPL/SP500_DIV_YIELD_MONTH') #MULTPLkeys[0]
SP500_PE_RATIO_MONTH = quandl.get('MULTPL/SP500_PE_RATIO_MONTH')
SHILLER_PE_RATIO_MONTH = quandl.get('MULTPL/SHILLER_PE_RATIO_MONTH')
SP500_EARNINGS_YIELD_MONTH = quandl.get('MULTPL/SP500_EARNINGS_YIELD_MONTH')
SP500_INFLADJ_MONTH = quandl.get('MULTPL/SP500_INFLADJ_MONTH')#MULTPLkeys[4]

SP500_PSR_QUARTER = quandl.get('MULTPL/SP500_PSR_QUARTER')
SP500_DIV_MONTH = quandl.get('MULTPL/SP500_DIV_MONTH')
SP500_DIV_YEAR = quandl.get('MULTPL/SP500_DIV_YEAR')
SP500_DIV_GROWTH_YEAR = quandl.get('MULTPL/SP500_DIV_GROWTH_YEAR')
SP500_DIV_GROWTH_QUARTER = quandl.get('MULTPL/SP500_DIV_GROWTH_QUARTER')
SP500_PBV_RATIO_QUARTER = quandl.get('MULTPL/SP500_PBV_RATIO_QUARTER') #MULTPLkeys[10]

SHILLER_PE_RATIO_YEAR = quandl.get('MULTPL/SHILLER_PE_RATIO_YEAR')
SP500_PE_RATIO_YEAR = quandl.get('MULTPL/SP500_PE_RATIO_YEAR')
SP500_DIV_YIELD_YEAR = quandl.get('MULTPL/SP500_DIV_YIELD_YEAR')
SP500_PSR_YEAR = quandl.get('MULTPL/SP500_PSR_YEAR')
SP500_EARNINGS_YIELD_YEAR = quandl.get('MULTPL/SP500_EARNINGS_YIELD_YEAR')
SP500_PBV_RATIO_YEAR =  quandl.get('MULTPL/SP500_PBV_RATIO_YEAR')
SP500_INFLADJ_YEAR =  quandl.get('MULTPL/SP500_INFLADJ_YEAR')
SP500_REAL_PRICE_MONTH = quandl.get('MULTPL/SP500_REAL_PRICE_MONTH') #MULTPLkeys[18]
SP500_SALES_YEAR =  quandl.get('MULTPL/SP500_SALES_YEAR')

SP500_SALES_GROWTH_YEAR = quandl.get('MULTPL/SP500_SALES_GROWTH_YEAR') #MULTPLkeys[20]
SP500_SALES_QUARTER =  quandl.get('MULTPL/SP500_SALES_QUARTER')
SP500_REAL_SALES_GROWTH_QUARTER = quandl.get('MULTPL/SP500_REAL_SALES_GROWTH_QUARTER')
SP500_SALES_GROWTH_QUARTER = quandl.get('MULTPL/SP500_SALES_GROWTH_QUARTER')
SP500_REAL_SALES_GROWTH_YEAR = quandl.get('MULTPL/SP500_REAL_SALES_GROWTH_YEAR')
SP500_REAL_EARNINGS_GROWTH_YEAR = quandl.get('MULTPL/SP500_REAL_EARNINGS_GROWTH_YEAR')
SP500_REAL_SALES_YEAR = quandl.get('MULTPL/SP500_REAL_SALES_YEAR')
SP500_REAL_EARNINGS_GROWTH_QUARTER = quandl.get('MULTPL/SP500_REAL_EARNINGS_GROWTH_QUARTER')
SP500_EARNINGS_GROWTH_QUARTER = quandl.get('MULTPL/SP500_EARNINGS_GROWTH_QUARTER')
SP500_REAL_SALES_QUARTER = quandl.get('MULTPL/SP500_REAL_SALES_QUARTER')

SP500_EARNINGS_MONTH = quandl.get('MULTPL/SP500_EARNINGS_MONTH') #MULTPLkeys[30]
SP500_BVPS_YEAR = quandl.get('MULTPL/SP500_BVPS_YEAR')
SP500_EARNINGS_YEAR = quandl.get('MULTPL/SP500_EARNINGS_YEAR')
SP500_EARNINGS_GROWTH_YEAR = quandl.get('MULTPL/SP500_EARNINGS_GROWTH_YEAR')
SP500_BVPS_QUARTER = quandl.get('MULTPL/SP500_BVPS_QUARTER')
SP500_REAL_PRICE_YEAR = quandl.get('MULTPL/SP500_REAL_PRICE_YEAR') #MULTPLkeys[35]
In [6]:
#API keywords list
MULTPLkeys = [
'MULTPL/SP500_DIV_YIELD_MONTH',
'MULTPL/SP500_PE_RATIO_MONTH',
'MULTPL/SHILLER_PE_RATIO_MONTH',
'MULTPL/SP500_EARNINGS_YIELD_MONTH',
'MULTPL/SP500_INFLADJ_MONTH',
'MULTPL/SP500_PSR_QUARTER',
'MULTPL/SP500_DIV_MONTH',
'MULTPL/SP500_DIV_YEAR',
'MULTPL/SP500_DIV_GROWTH_YEAR',
'MULTPL/SP500_DIV_GROWTH_QUARTER',
'MULTPL/SP500_PBV_RATIO_QUARTER',
'MULTPL/SHILLER_PE_RATIO_YEAR',
'MULTPL/SP500_PE_RATIO_YEAR',
'MULTPL/SP500_DIV_YIELD_YEAR',
'MULTPL/SP500_PSR_YEAR',
'MULTPL/SP500_EARNINGS_YIELD_YEAR',
'MULTPL/SP500_PBV_RATIO_YEAR',
'MULTPL/SP500_INFLADJ_YEAR',
'MULTPL/SP500_REAL_PRICE_MONTH',
'MULTPL/SP500_SALES_YEAR',
'MULTPL/SP500_SALES_GROWTH_YEAR',
'MULTPL/SP500_SALES_QUARTER',
'MULTPL/SP500_REAL_SALES_GROWTH_QUARTER',
'MULTPL/SP500_SALES_GROWTH_QUARTER',
'MULTPL/SP500_REAL_SALES_GROWTH_YEAR',
'MULTPL/SP500_REAL_EARNINGS_GROWTH_YEAR',
'MULTPL/SP500_REAL_SALES_YEAR',
'MULTPL/SP500_REAL_EARNINGS_GROWTH_QUARTER',
'MULTPL/SP500_EARNINGS_GROWTH_QUARTER',
'MULTPL/SP500_REAL_SALES_QUARTER',
'MULTPL/SP500_EARNINGS_MONTH',
'MULTPL/SP500_BVPS_YEAR',
'MULTPL/SP500_EARNINGS_YEAR',
'MULTPL/SP500_EARNINGS_GROWTH_YEAR',
'MULTPL/SP500_BVPS_QUARTER',
'MULTPL/SP500_REAL_PRICE_YEAR'
]

#Split API keywords to create df variable names
var_list = [i.split('/')[1] for i in MULTPLkeys]
print(var_list[0],var_list[1],var_list[2])
SP500_DIV_YIELD_MONTH SP500_PE_RATIO_MONTH SHILLER_PE_RATIO_MONTH
In [7]:
# def get_data(i):
#     return quandl.get('MULTPL/{}'.format(i))
In [8]:
var_list
Out[8]:
['SP500_DIV_YIELD_MONTH',
 'SP500_PE_RATIO_MONTH',
 'SHILLER_PE_RATIO_MONTH',
 'SP500_EARNINGS_YIELD_MONTH',
 'SP500_INFLADJ_MONTH',
 'SP500_PSR_QUARTER',
 'SP500_DIV_MONTH',
 'SP500_DIV_YEAR',
 'SP500_DIV_GROWTH_YEAR',
 'SP500_DIV_GROWTH_QUARTER',
 'SP500_PBV_RATIO_QUARTER',
 'SHILLER_PE_RATIO_YEAR',
 'SP500_PE_RATIO_YEAR',
 'SP500_DIV_YIELD_YEAR',
 'SP500_PSR_YEAR',
 'SP500_EARNINGS_YIELD_YEAR',
 'SP500_PBV_RATIO_YEAR',
 'SP500_INFLADJ_YEAR',
 'SP500_REAL_PRICE_MONTH',
 'SP500_SALES_YEAR',
 'SP500_SALES_GROWTH_YEAR',
 'SP500_SALES_QUARTER',
 'SP500_REAL_SALES_GROWTH_QUARTER',
 'SP500_SALES_GROWTH_QUARTER',
 'SP500_REAL_SALES_GROWTH_YEAR',
 'SP500_REAL_EARNINGS_GROWTH_YEAR',
 'SP500_REAL_SALES_YEAR',
 'SP500_REAL_EARNINGS_GROWTH_QUARTER',
 'SP500_EARNINGS_GROWTH_QUARTER',
 'SP500_REAL_SALES_QUARTER',
 'SP500_EARNINGS_MONTH',
 'SP500_BVPS_YEAR',
 'SP500_EARNINGS_YEAR',
 'SP500_EARNINGS_GROWTH_YEAR',
 'SP500_BVPS_QUARTER',
 'SP500_REAL_PRICE_YEAR']
In [9]:
# var_list1 = [
#     'SP500_DIV_YIELD_MONTH1',
#     'SP500_PE_RATIO_MONTH2',
#     'SHILLER_PE_RATIO_MONTH']
In [10]:
# for i in var_list:
#     #i = pd.DataFrame(get_data(i))
#     exec(f'{i} = get_data(i)')
#     #print(get_data(i))
#     print(i)
#     break
In [11]:
# x= 'SP500_DIV_YIELD_MONTH'
# exec("%s = %d" % (x,0))
# print(SP500_DIV_YIELD_MONTH)
# print(x)

#exec(f'{i} = get_data(i)')
In [12]:
# for i in range(len(var_list)//10):
#     x= var_list[i]    
#     exec("%s = %d" % (x,0))
#     #print(var_list[i])
#     #print(x)
#     print(SP500_DIV_YIELD_MONTH)

# # SP500_DIV_YIELD_MONTH = quandl.get(str(MULTPLkeys[0]))

# # print(SP500_DIV_YIELD_MONTH.head())
# #SP500_DIV_YIELD_MONTH
# #SP500_PE_RATIO_MONTH
# #SHILLER_PE_RATIO_MONTH

# #dynamic variables
# x= var_list[1]    
# exec("%s = %s" % (x,x))
# SP500_PE_RATIO_MONTH
In [13]:
print(SP500_REAL_PRICE_MONTH.head())
print(SP500_EARNINGS_YIELD_MONTH.head())
            Value
Date             
1871-01-01   4.44
1871-02-01   4.50
1871-03-01   4.61
1871-04-01   4.74
1871-05-01   4.86
            Value
Date             
1871-01-01   9.01
1871-02-01   8.89
1871-03-01   8.68
1871-04-01   8.44
1871-05-01   8.23

df = pd.merge(pd.merge(SP500_REAL_PRICE_MONTH,SP500_EARNINGS_YIELD_MONTH,on='Date'),SHILLER_PE_RATIO_MONTH,on='Date') df.head()

Assumptions-

  1. Assuming SP500_REAL_PRICE_MONTH is remains same on 1st of every month and last day of that month to remove NaN values.
  2. SP500_DIV_YIELD_MONTH and yield values are available for last day of every month. Assuming it remains the same on first of next month by carrying over the same value to first day of next month.
outer join between price and monthly yield df to include all dates of every month.

SP500_REAL_PRICE_MONTH is the target label hence start with this df to include all rows 
In [14]:
SP500_DIV_YIELD_MONTH.head()
Out[14]:
Value
Date
1871-01-31 5.86
1871-02-28 5.78
1871-03-31 5.64
1871-04-30 5.49
1871-05-31 5.35
In [15]:
#Join all month dataframes together on Date column.

df = SP500_REAL_PRICE_MONTH.join(SP500_DIV_YIELD_MONTH,on=None,how='outer',
                                  lsuffix='_SP500_REAL_PRICE_MONTH',rsuffix='_SP500_DIV_YIELD_MONTH',sort=False)

df= df.join(SP500_PE_RATIO_MONTH,on='Date',how='left',sort=False)

df = df.merge(
    SHILLER_PE_RATIO_MONTH,on='Date',how='left').merge(
    SP500_EARNINGS_YIELD_MONTH,on='Date',how='left').merge(
    SP500_INFLADJ_MONTH,on='Date',how='left').merge(
    SP500_PSR_QUARTER,on='Date',how='left').merge(
    SP500_DIV_MONTH,on='Date',how='outer').merge(
    SP500_DIV_YEAR,on='Date',how='left')


df.columns = ['Value_SP500_REAL_PRICE_MONTH','Value_SP500_DIV_YIELD_MONTH','Value_SP500_PE_RATIO_MONTH',
                'Value_SHILLER_PE_RATIO_MONTH','Value_SP500_EARNINGS_YIELD_MONTH','Value_SP500_INFLADJ_MONTH',
              'Value_SP500_PSR_QUARTER','Value_SP500_DIV_MONTH','Value_SP500_DIV_YEAR'
             ]

#print(df.head(6))
print(df.columns)
print(df.shape)
Index(['Value_SP500_REAL_PRICE_MONTH', 'Value_SP500_DIV_YIELD_MONTH',
       'Value_SP500_PE_RATIO_MONTH', 'Value_SHILLER_PE_RATIO_MONTH',
       'Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_INFLADJ_MONTH',
       'Value_SP500_PSR_QUARTER', 'Value_SP500_DIV_MONTH',
       'Value_SP500_DIV_YEAR'],
      dtype='object')
(3548, 9)
In [16]:
def Merge__Rename_function(df,df_var,column_name):
    df = df.merge(df_var,on='Date',how='outer')
    Column_name = 'Value_' + str(column_name)
    #print(Column_name)
    df.rename({'Value':Column_name},axis=1,inplace=True)
    #print(df.head(6))
    print(df.shape)
    #print(df.columns)
    
    return df
In [17]:
var_list
Out[17]:
['SP500_DIV_YIELD_MONTH',
 'SP500_PE_RATIO_MONTH',
 'SHILLER_PE_RATIO_MONTH',
 'SP500_EARNINGS_YIELD_MONTH',
 'SP500_INFLADJ_MONTH',
 'SP500_PSR_QUARTER',
 'SP500_DIV_MONTH',
 'SP500_DIV_YEAR',
 'SP500_DIV_GROWTH_YEAR',
 'SP500_DIV_GROWTH_QUARTER',
 'SP500_PBV_RATIO_QUARTER',
 'SHILLER_PE_RATIO_YEAR',
 'SP500_PE_RATIO_YEAR',
 'SP500_DIV_YIELD_YEAR',
 'SP500_PSR_YEAR',
 'SP500_EARNINGS_YIELD_YEAR',
 'SP500_PBV_RATIO_YEAR',
 'SP500_INFLADJ_YEAR',
 'SP500_REAL_PRICE_MONTH',
 'SP500_SALES_YEAR',
 'SP500_SALES_GROWTH_YEAR',
 'SP500_SALES_QUARTER',
 'SP500_REAL_SALES_GROWTH_QUARTER',
 'SP500_SALES_GROWTH_QUARTER',
 'SP500_REAL_SALES_GROWTH_YEAR',
 'SP500_REAL_EARNINGS_GROWTH_YEAR',
 'SP500_REAL_SALES_YEAR',
 'SP500_REAL_EARNINGS_GROWTH_QUARTER',
 'SP500_EARNINGS_GROWTH_QUARTER',
 'SP500_REAL_SALES_QUARTER',
 'SP500_EARNINGS_MONTH',
 'SP500_BVPS_YEAR',
 'SP500_EARNINGS_YEAR',
 'SP500_EARNINGS_GROWTH_YEAR',
 'SP500_BVPS_QUARTER',
 'SP500_REAL_PRICE_YEAR']
In [18]:
df = Merge__Rename_function(df,SP500_DIV_GROWTH_YEAR,'SP500_DIV_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_DIV_GROWTH_QUARTER,'SP500_DIV_GROWTH_QUARTER')
df = Merge__Rename_function(df,SP500_PBV_RATIO_QUARTER,'SP500_PBV_RATIO_QUARTER')

df = Merge__Rename_function(df,SHILLER_PE_RATIO_YEAR,'SHILLER_PE_RATIO_YEAR')
df = Merge__Rename_function(df,SP500_PE_RATIO_YEAR,'SP500_PE_RATIO_YEAR')
df = Merge__Rename_function(df,SP500_DIV_YIELD_YEAR,'SP500_DIV_YIELD_YEAR')
df = Merge__Rename_function(df,SP500_PSR_YEAR,'SP500_PSR_YEAR')
df = Merge__Rename_function(df,SP500_EARNINGS_YIELD_YEAR,'SP500_EARNINGS_YIELD_YEAR')
df = Merge__Rename_function(df,SP500_PBV_RATIO_YEAR,'SP500_PBV_RATIO_YEAR')
df = Merge__Rename_function(df,SP500_INFLADJ_YEAR,'SP500_INFLADJ_YEAR')

df = Merge__Rename_function(df,SP500_SALES_YEAR,'SP500_SALES_YEAR')
df = Merge__Rename_function(df,SP500_SALES_GROWTH_YEAR,'SP500_SALES_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_SALES_QUARTER,'SP500_SALES_QUARTER')
df = Merge__Rename_function(df,SP500_REAL_SALES_GROWTH_QUARTER,'SP500_REAL_SALES_GROWTH_QUARTER')
df = Merge__Rename_function(df,SP500_SALES_GROWTH_QUARTER,'SP500_SALES_GROWTH_QUARTER')
df = Merge__Rename_function(df,SP500_REAL_SALES_GROWTH_YEAR,'SP500_REAL_SALES_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_REAL_EARNINGS_GROWTH_YEAR,'SP500_REAL_EARNINGS_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_REAL_SALES_YEAR,'SP500_REAL_SALES_YEAR')

df = Merge__Rename_function(df,SP500_REAL_EARNINGS_GROWTH_QUARTER,var_list[27])
df = Merge__Rename_function(df,SP500_EARNINGS_GROWTH_QUARTER,var_list[28])
df = Merge__Rename_function(df,SP500_REAL_SALES_QUARTER,var_list[29])
df = Merge__Rename_function(df,SP500_EARNINGS_MONTH,var_list[30])
df = Merge__Rename_function(df,SP500_BVPS_YEAR,var_list[31])
df = Merge__Rename_function(df,SP500_EARNINGS_YEAR,var_list[32])
df = Merge__Rename_function(df,SP500_EARNINGS_GROWTH_YEAR,var_list[33])
df = Merge__Rename_function(df,SP500_BVPS_QUARTER,var_list[34])
#df = Merge__Rename_function(df,SP500_REAL_PRICE_YEAR,var_list[35])

df.columns
(3553, 10)
(3559, 11)
(3559, 12)
(3559, 13)
(3559, 14)
(3559, 15)
(3559, 16)
(3559, 17)
(3559, 18)
(3559, 19)
(3559, 20)
(3559, 21)
(3559, 22)
(3559, 23)
(3559, 24)
(3559, 25)
(3559, 26)
(3559, 27)
(3559, 28)
(3559, 29)
(3559, 30)
(3559, 31)
(3559, 32)
(3559, 33)
(3559, 34)
(3559, 35)
Out[18]:
Index(['Value_SP500_REAL_PRICE_MONTH', 'Value_SP500_DIV_YIELD_MONTH',
       'Value_SP500_PE_RATIO_MONTH', 'Value_SHILLER_PE_RATIO_MONTH',
       'Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_INFLADJ_MONTH',
       'Value_SP500_PSR_QUARTER', 'Value_SP500_DIV_MONTH',
       'Value_SP500_DIV_YEAR', 'Value_SP500_DIV_GROWTH_YEAR',
       'Value_SP500_DIV_GROWTH_QUARTER', 'Value_SP500_PBV_RATIO_QUARTER',
       'Value_SHILLER_PE_RATIO_YEAR', 'Value_SP500_PE_RATIO_YEAR',
       'Value_SP500_DIV_YIELD_YEAR', 'Value_SP500_PSR_YEAR',
       'Value_SP500_EARNINGS_YIELD_YEAR', 'Value_SP500_PBV_RATIO_YEAR',
       'Value_SP500_INFLADJ_YEAR', 'Value_SP500_SALES_YEAR',
       'Value_SP500_SALES_GROWTH_YEAR', 'Value_SP500_SALES_QUARTER',
       'Value_SP500_REAL_SALES_GROWTH_QUARTER',
       'Value_SP500_SALES_GROWTH_QUARTER',
       'Value_SP500_REAL_SALES_GROWTH_YEAR',
       'Value_SP500_REAL_EARNINGS_GROWTH_YEAR', 'Value_SP500_REAL_SALES_YEAR',
       'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER',
       'Value_SP500_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_SALES_QUARTER',
       'Value_SP500_EARNINGS_MONTH', 'Value_SP500_BVPS_YEAR',
       'Value_SP500_EARNINGS_YEAR', 'Value_SP500_EARNINGS_GROWTH_YEAR',
       'Value_SP500_BVPS_QUARTER'],
      dtype='object')
In [19]:
df.head()
Out[19]:
Value_SP500_REAL_PRICE_MONTH Value_SP500_DIV_YIELD_MONTH Value_SP500_PE_RATIO_MONTH Value_SHILLER_PE_RATIO_MONTH Value_SP500_EARNINGS_YIELD_MONTH Value_SP500_INFLADJ_MONTH Value_SP500_PSR_QUARTER Value_SP500_DIV_MONTH Value_SP500_DIV_YEAR Value_SP500_DIV_GROWTH_YEAR ... Value_SP500_REAL_EARNINGS_GROWTH_YEAR Value_SP500_REAL_SALES_YEAR Value_SP500_REAL_EARNINGS_GROWTH_QUARTER Value_SP500_EARNINGS_GROWTH_QUARTER Value_SP500_REAL_SALES_QUARTER Value_SP500_EARNINGS_MONTH Value_SP500_BVPS_YEAR Value_SP500_EARNINGS_YEAR Value_SP500_EARNINGS_GROWTH_YEAR Value_SP500_BVPS_QUARTER
Date
1871-01-01 4.44 NaN 11.10 NaN 9.01 89.81 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1871-01-31 NaN 5.86 NaN NaN NaN NaN NaN 5.26 5.15 NaN ... NaN NaN NaN NaN NaN 8.09 NaN 7.92 NaN NaN
1871-02-01 4.50 NaN 11.25 10.92 8.89 88.33 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1871-02-28 NaN 5.78 NaN NaN NaN NaN NaN 5.10 NaN NaN ... NaN NaN NaN NaN NaN 7.85 NaN NaN NaN NaN
1871-03-01 4.61 NaN 11.52 11.19 8.68 89.17 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 35 columns

In [20]:
#Pandas profiling of df
pandas_profiling.ProfileReport(df)
Out[20]:

Overview

Dataset info

Number of variables 36
Number of observations 3559
Total Missing (%) 15.2%
Total size in memory 1001.0 KiB
Average record size in memory 288.0 B

Variables types

Numeric 8
Categorical 0
Boolean 0
Date 1
Text (Unique) 0
Rejected 27
Unsupported 0

Warnings

Variables

Date
Date

Distinct count 3559
Unique (%) 100.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Minimum 1871-01-01 00:00:00
Maximum 2018-12-31 00:00:00

Value_SHILLER_PE_RATIO_MONTH
Numeric

Distinct count 1189
Unique (%) 33.4%
Missing (%) 50.2%
Missing (n) 1787
Infinite (%) 0.0%
Infinite (n) 0
Mean 16.57
Minimum 4.78
Maximum 44.19
Zeros (%) 0.0%

Quantile statistics

Minimum 4.78
5-th percentile 7.982
Q1 11.627
Median 15.67
Q3 20.15
95-th percentile 27.901
Maximum 44.19
Range 39.41
Interquartile range 8.5225

Descriptive statistics

Standard deviation 6.6713
Coef of variation 0.40262
Kurtosis 1.99
Mean 16.57
MAD 5.1052
Skewness 1.1235
Sum 29362
Variance 44.506
Memory size 27.9 KiB
Value Count Frequency (%)  
11.34 6 0.2%
 
17.82 6 0.2%
 
16.6 5 0.1%
 
12.05 5 0.1%
 
13.8 5 0.1%
 
17.05 4 0.1%
 
18.2 4 0.1%
 
15.27 4 0.1%
 
16.16 4 0.1%
 
15.47 4 0.1%
 
Other values (1178) 1725 48.5%
 
(Missing) 1787 50.2%
 

Minimum 5 values

Value Count Frequency (%)  
4.78 1 0.0%
 
5.02 1 0.0%
 
5.04 1 0.0%
 
5.08 1 0.0%
 
5.12 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
43.22 1 0.0%
 
43.53 1 0.0%
 
43.77 1 0.0%
 
43.83 1 0.0%
 
44.19 1 0.0%
 

Value_SHILLER_PE_RATIO_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PBV_RATIO_QUARTER and should be ignored for analysis

Correlation 1

Value_SP500_BVPS_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_BVPS_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_BVPS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis

Correlation 0.9637

Value_SP500_DIV_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_DIV_GROWTH_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_DIV_GROWTH_YEAR
Numeric

Distinct count 34
Unique (%) 1.0%
Missing (%) 99.0%
Missing (n) 3522
Infinite (%) 0.0%
Infinite (n) 0
Mean 6.5478
Minimum -21.07
Maximum 18.25
Zeros (%) 0.0%

Quantile statistics

Minimum -21.07
5-th percentile -2.668
Q1 3.07
Median 7.07
Q3 11.45
95-th percentile 16.26
Maximum 18.25
Range 39.32
Interquartile range 8.38

Descriptive statistics

Standard deviation 6.9429
Coef of variation 1.0603
Kurtosis 5.896
Mean 6.5478
MAD 4.8668
Skewness -1.6172
Sum 242.27
Variance 48.204
Memory size 27.9 KiB
Value Count Frequency (%)  
7.99 2 0.1%
 
5.33 2 0.1%
 
7.07 2 0.1%
 
16.26 2 0.1%
 
9.33 1 0.0%
 
1.49 1 0.0%
 
8.16 1 0.0%
 
12.01 1 0.0%
 
10.0 1 0.0%
 
11.99 1 0.0%
 
Other values (23) 23 0.6%
 
(Missing) 3522 99.0%
 

Minimum 5 values

Value Count Frequency (%)  
-21.07 1 0.0%
 
-3.26 1 0.0%
 
-2.52 1 0.0%
 
0.97 1 0.0%
 
1.45 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
12.72 1 0.0%
 
13.38 1 0.0%
 
14.27 1 0.0%
 
16.26 2 0.1%
 
18.25 1 0.0%
 

Value_SP500_DIV_MONTH
Numeric

Distinct count 1119
Unique (%) 31.4%
Missing (%) 50.2%
Missing (n) 1786
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.415
Minimum 5.03
Maximum 52.26
Zeros (%) 0.0%

Quantile statistics

Minimum 5.03
5-th percentile 6.7
Q1 8.61
Median 12.88
Q3 19.6
95-th percentile 32.694
Maximum 52.26
Range 47.23
Interquartile range 10.99

Descriptive statistics

Standard deviation 8.7887
Coef of variation 0.57013
Kurtosis 3.0931
Mean 15.415
MAD 6.7561
Skewness 1.5803
Sum 27331
Variance 77.241
Memory size 27.9 KiB
Value Count Frequency (%)  
7.19 9 0.3%
 
7.29 8 0.2%
 
8.09 7 0.2%
 
6.81 7 0.2%
 
7.11 6 0.2%
 
8.4 6 0.2%
 
8.95 6 0.2%
 
7.22 6 0.2%
 
9.11 5 0.1%
 
7.71 5 0.1%
 
Other values (1108) 1708 48.0%
 
(Missing) 1786 50.2%
 

Minimum 5 values

Value Count Frequency (%)  
5.03 1 0.0%
 
5.04 1 0.0%
 
5.1 1 0.0%
 
5.17 2 0.1%
 
5.18 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
50.75 1 0.0%
 
51.0 1 0.0%
 
51.45 1 0.0%
 
51.87 1 0.0%
 
52.26 1 0.0%
 

Value_SP500_DIV_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_MONTH and should be ignored for analysis

Correlation 0.99996

Value_SP500_DIV_YIELD_MONTH
Numeric

Distinct count 608
Unique (%) 17.1%
Missing (%) 50.1%
Missing (n) 1782
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.3421
Minimum 1.11
Maximum 13.84
Zeros (%) 0.0%

Quantile statistics

Minimum 1.11
5-th percentile 1.74
Q1 3.15
Median 4.28
Q3 5.39
95-th percentile 7.18
Maximum 13.84
Range 12.73
Interquartile range 2.24

Descriptive statistics

Standard deviation 1.7025
Coef of variation 0.39208
Kurtosis 0.83756
Mean 4.3421
MAD 1.3453
Skewness 0.48864
Sum 7715.9
Variance 2.8984
Memory size 27.9 KiB
Value Count Frequency (%)  
4.55 10 0.3%
 
4.22 10 0.3%
 
5.18 10 0.3%
 
5.22 10 0.3%
 
4.43 9 0.3%
 
3.53 9 0.3%
 
1.76 9 0.3%
 
3.87 8 0.2%
 
4.69 8 0.2%
 
2.9 8 0.2%
 
Other values (597) 1686 47.4%
 
(Missing) 1782 50.1%
 

Minimum 5 values

Value Count Frequency (%)  
1.11 2 0.1%
 
1.13 1 0.0%
 
1.14 1 0.0%
 
1.15 1 0.0%
 
1.16 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
10.15 1 0.0%
 
11.36 1 0.0%
 
12.46 1 0.0%
 
12.64 1 0.0%
 
13.84 1 0.0%
 

Value_SP500_DIV_YIELD_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_EARNINGS_YIELD_MONTH and should be ignored for analysis

Correlation 1

Value_SP500_EARNINGS_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99905

Value_SP500_EARNINGS_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_EARNINGS_GROWTH_QUARTER and should be ignored for analysis

Correlation 1

Value_SP500_EARNINGS_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis

Correlation 0.94144

Value_SP500_EARNINGS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_EARNINGS_MONTH and should be ignored for analysis

Correlation 0.99996

Value_SP500_EARNINGS_YIELD_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YIELD_MONTH and should be ignored for analysis

Correlation 1

Value_SP500_EARNINGS_YIELD_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YIELD_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_INFLADJ_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_REAL_PRICE_MONTH and should be ignored for analysis

Correlation 0.96721

Value_SP500_INFLADJ_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PBV_RATIO_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_PBV_RATIO_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_INFLADJ_MONTH and should be ignored for analysis

Correlation 1

Value_SP500_PBV_RATIO_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PE_RATIO_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_PE_RATIO_MONTH
Numeric

Distinct count 1117
Unique (%) 31.4%
Missing (%) 50.2%
Missing (n) 1786
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.725
Minimum 5.31
Maximum 123.73
Zeros (%) 0.0%

Quantile statistics

Minimum 5.31
5-th percentile 7.83
Q1 11.45
Median 14.73
Q3 18.04
95-th percentile 25.798
Maximum 123.73
Range 118.42
Interquartile range 6.59

Descriptive statistics

Standard deviation 8.4055
Coef of variation 0.53453
Kurtosis 67.915
Mean 15.725
MAD 4.6369
Skewness 6.4631
Sum 27880
Variance 70.653
Memory size 27.9 KiB
Value Count Frequency (%)  
15.61 6 0.2%
 
19.0 5 0.1%
 
13.82 5 0.1%
 
9.84 5 0.1%
 
7.97 5 0.1%
 
11.48 5 0.1%
 
12.21 5 0.1%
 
17.48 5 0.1%
 
17.83 5 0.1%
 
12.56 4 0.1%
 
Other values (1106) 1723 48.4%
 
(Missing) 1786 50.2%
 

Minimum 5 values

Value Count Frequency (%)  
5.31 1 0.0%
 
5.41 1 0.0%
 
5.74 1 0.0%
 
5.81 1 0.0%
 
5.82 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
101.87 1 0.0%
 
110.37 1 0.0%
 
119.85 1 0.0%
 
123.32 1 0.0%
 
123.73 1 0.0%
 

Value_SP500_PE_RATIO_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PBV_RATIO_QUARTER and should be ignored for analysis

Correlation 1

Value_SP500_PSR_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_INFLADJ_MONTH and should be ignored for analysis

Correlation 1

Value_SP500_PSR_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PE_RATIO_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_REAL_EARNINGS_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_REAL_EARNINGS_GROWTH_YEAR
Numeric

Distinct count 36
Unique (%) 1.0%
Missing (%) 99.0%
Missing (n) 3522
Infinite (%) 0.0%
Infinite (n) 0
Mean 12.684
Minimum -79.48
Maximum 261.66
Zeros (%) 0.0%

Quantile statistics

Minimum -79.48
5-th percentile -32.784
Q1 0.3
Median 10.69
Q3 14.93
95-th percentile 54.266
Maximum 261.66
Range 341.14
Interquartile range 14.63

Descriptive statistics

Standard deviation 48.913
Coef of variation 3.8564
Kurtosis 19.396
Mean 12.684
MAD 21.87
Skewness 3.637
Sum 469.29
Variance 2392.5
Memory size 27.9 KiB
Value Count Frequency (%)  
8.86 2 0.1%
 
7.81 2 0.1%
 
-6.49 1 0.0%
 
15.9 1 0.0%
 
-11.6 1 0.0%
 
-79.48 1 0.0%
 
10.69 1 0.0%
 
11.38 1 0.0%
 
0.36 1 0.0%
 
-51.84 1 0.0%
 
Other values (25) 25 0.7%
 
(Missing) 3522 99.0%
 

Minimum 5 values

Value Count Frequency (%)  
-79.48 1 0.0%
 
-51.84 1 0.0%
 
-28.02 1 0.0%
 
-21.05 1 0.0%
 
-15.56 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
24.85 1 0.0%
 
36.07 1 0.0%
 
49.72 1 0.0%
 
72.45 1 0.0%
 
261.66 1 0.0%
 

Value_SP500_REAL_PRICE_MONTH
Numeric

Distinct count 1401
Unique (%) 39.4%
Missing (%) 50.2%
Missing (n) 1788
Infinite (%) 0.0%
Infinite (n) 0
Mean 262.57
Minimum 2.73
Maximum 2789.8
Zeros (%) 0.0%

Quantile statistics

Minimum 2.73
5-th percentile 4.4
Q1 7.745
Median 16.5
Q3 123.65
95-th percentile 1420.4
Maximum 2789.8
Range 2787.1
Interquartile range 115.91

Descriptive statistics

Standard deviation 523.67
Coef of variation 1.9944
Kurtosis 5.389
Mean 262.57
MAD 359.68
Skewness 2.4198
Sum 465010
Variance 274230
Memory size 27.9 KiB
Value Count Frequency (%)  
4.37 7 0.2%
 
4.46 7 0.2%
 
5.3 6 0.2%
 
7.68 6 0.2%
 
5.18 6 0.2%
 
5.32 6 0.2%
 
4.34 5 0.1%
 
4.38 5 0.1%
 
8.12 5 0.1%
 
4.65 5 0.1%
 
Other values (1390) 1713 48.1%
 
(Missing) 1788 50.2%
 

Minimum 5 values

Value Count Frequency (%)  
2.73 1 0.0%
 
2.85 1 0.0%
 
2.94 2 0.1%
 
3.05 1 0.0%
 
3.17 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2702.77 1 0.0%
 
2705.16 1 0.0%
 
2736.61 1 0.0%
 
2754.35 1 0.0%
 
2789.8 1 0.0%
 

Value_SP500_REAL_SALES_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_SALES_GROWTH_YEAR and should be ignored for analysis

Correlation 0.99159

Value_SP500_REAL_SALES_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99159

Value_SP500_REAL_SALES_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_YEAR and should be ignored for analysis

Correlation 0.99714

Value_SP500_REAL_SALES_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis

Correlation 0.9333

Value_SP500_SALES_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99183

Value_SP500_SALES_GROWTH_YEAR
Numeric

Distinct count 22
Unique (%) 0.6%
Missing (%) 99.3%
Missing (n) 3534
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.2676
Minimum -12.86
Maximum 10.93
Zeros (%) 0.0%

Quantile statistics

Minimum -12.86
5-th percentile -7.382
Q1 2.09
Median 5.68
Q3 7.68
95-th percentile 10.576
Maximum 10.93
Range 23.79
Interquartile range 5.59

Descriptive statistics

Standard deviation 5.775
Coef of variation 1.3532
Kurtosis 2.4996
Mean 4.2676
MAD 4.2259
Skewness -1.5198
Sum 106.69
Variance 33.351
Memory size 27.9 KiB
Value Count Frequency (%)  
2.09 2 0.1%
 
9.36 2 0.1%
 
7.68 2 0.1%
 
7.03 2 0.1%
 
1.7 1 0.0%
 
5.68 1 0.0%
 
5.37 1 0.0%
 
10.93 1 0.0%
 
4.16 1 0.0%
 
-1.18 1 0.0%
 
Other values (11) 11 0.3%
 
(Missing) 3534 99.3%
 

Minimum 5 values

Value Count Frequency (%)  
-12.86 1 0.0%
 
-8.45 1 0.0%
 
-3.11 1 0.0%
 
-1.18 1 0.0%
 
1.7 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
8.94 1 0.0%
 
9.03 1 0.0%
 
9.36 2 0.1%
 
10.88 1 0.0%
 
10.93 1 0.0%
 

Value_SP500_SALES_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_SALES_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_SALES_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis

Correlation 0.92834

Correlations

Sample

Value_SP500_REAL_PRICE_MONTH Value_SP500_DIV_YIELD_MONTH Value_SP500_PE_RATIO_MONTH Value_SHILLER_PE_RATIO_MONTH Value_SP500_EARNINGS_YIELD_MONTH Value_SP500_INFLADJ_MONTH Value_SP500_PSR_QUARTER Value_SP500_DIV_MONTH Value_SP500_DIV_YEAR Value_SP500_DIV_GROWTH_YEAR Value_SP500_DIV_GROWTH_QUARTER Value_SP500_PBV_RATIO_QUARTER Value_SHILLER_PE_RATIO_YEAR Value_SP500_PE_RATIO_YEAR Value_SP500_DIV_YIELD_YEAR Value_SP500_PSR_YEAR Value_SP500_EARNINGS_YIELD_YEAR Value_SP500_PBV_RATIO_YEAR Value_SP500_INFLADJ_YEAR Value_SP500_SALES_YEAR Value_SP500_SALES_GROWTH_YEAR Value_SP500_SALES_QUARTER Value_SP500_REAL_SALES_GROWTH_QUARTER Value_SP500_SALES_GROWTH_QUARTER Value_SP500_REAL_SALES_GROWTH_YEAR Value_SP500_REAL_EARNINGS_GROWTH_YEAR Value_SP500_REAL_SALES_YEAR Value_SP500_REAL_EARNINGS_GROWTH_QUARTER Value_SP500_EARNINGS_GROWTH_QUARTER Value_SP500_REAL_SALES_QUARTER Value_SP500_EARNINGS_MONTH Value_SP500_BVPS_YEAR Value_SP500_EARNINGS_YEAR Value_SP500_EARNINGS_GROWTH_YEAR Value_SP500_BVPS_QUARTER
Date
1871-01-01 4.44 NaN 11.10 NaN 9.01 89.81 NaN NaN NaN NaN NaN NaN NaN 11.1 NaN NaN 9.01 NaN 89.81 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1871-01-31 NaN 5.86 NaN NaN NaN NaN NaN 5.26 5.15 NaN NaN NaN NaN NaN 5.86 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.09 NaN 7.92 NaN NaN
1871-02-01 4.50 NaN 11.25 10.92 8.89 88.33 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1871-02-28 NaN 5.78 NaN NaN NaN NaN NaN 5.10 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.85 NaN NaN NaN NaN
1871-03-01 4.61 NaN 11.52 11.19 8.68 89.17 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [21]:
def Column_missing_values(df):
    '''
    Function to find missing values or NaN values per column and print percentage of missing values per column
    in bar chart.
    IN - any pandas dataframe 
    '''
    #Percentage of missing values per column
    missing_columns = df.isnull().sum()
    if missing_columns.values.any() != 0:
        missing_columns = (missing_columns[missing_columns>0]/df.shape[0]) * 100
        missing_columns.sort_values(inplace=True)
        missing_columns.plot.bar(title = 'Column wise percentage missing values', figsize=(8,4))
    else:
        print('No missing values in provided dataframe')
        
    return
In [22]:
Column_missing_values(df)
In [23]:
#Correlation plot for numerical encoded version of API data in df.
sns.heatmap(df.corr(),annot=None,fmt='.2f',square=False)
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e5362196a0>
In [24]:
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(df)
plt.show()
In [25]:
#Drop last rows with NaN values for Value_SP500_REAL_PRICE_MONTH because these were outlier with no price values.
df.drop(df.tail(11).index,axis=0,inplace=True)

Impute and Interpolate missing values

Original dataset has many missing or NaN values which is noise to end results. General methods like replacing with mean values can not be applied because it would add bias to the timeseries dataset.

In [26]:
import  impyute.imputation.cs 
import impyute.imputation.ts
from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer, MissingIndicator
from statsmodels.tsa.arima_model import ARIMA
from sklearn.model_selection import cross_val_score
In [27]:
#Create an imputation object
imputer_most_frequent= SimpleImputer(missing_values=np.nan,strategy ='most_frequent')
#Inject imputed values in the dataset.
df_imputed = pd.DataFrame(imputer_most_frequent.fit_transform(df))
df_imputed.columns = df.columns
df_imputed.index = df.index
In [28]:
pandas_profiling.ProfileReport(df_imputed)
Out[28]:

Overview

Dataset info

Number of variables 36
Number of observations 3548
Total Missing (%) 0.0%
Total size in memory 998.0 KiB
Average record size in memory 288.0 B

Variables types

Numeric 25
Categorical 0
Boolean 0
Date 1
Text (Unique) 0
Rejected 10
Unsupported 0

Warnings

Variables

Date
Date

Distinct count 3548
Unique (%) 100.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Minimum 1871-01-01 00:00:00
Maximum 2018-12-31 00:00:00

Value_SHILLER_PE_RATIO_MONTH
Numeric

Distinct count 1188
Unique (%) 33.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 13.952
Minimum 4.78
Maximum 44.19
Zeros (%) 0.0%

Quantile statistics

Minimum 4.78
5-th percentile 9.26
Q1 11.34
Median 11.34
Q3 15.66
95-th percentile 25.41
Maximum 44.19
Range 39.41
Interquartile range 4.32

Descriptive statistics

Standard deviation 5.3909
Coef of variation 0.38639
Kurtosis 6.0424
Mean 13.952
MAD 3.9298
Skewness 2.1774
Sum 49501
Variance 29.061
Memory size 27.8 KiB
Value Count Frequency (%)  
11.34 1782 50.2%
 
17.82 6 0.2%
 
13.8 5 0.1%
 
16.6 5 0.1%
 
12.05 5 0.1%
 
16.83 4 0.1%
 
10.0 4 0.1%
 
10.91 4 0.1%
 
15.47 4 0.1%
 
18.96 4 0.1%
 
Other values (1178) 1725 48.6%
 

Minimum 5 values

Value Count Frequency (%)  
4.78 1 0.0%
 
5.02 1 0.0%
 
5.04 1 0.0%
 
5.08 1 0.0%
 
5.12 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
43.22 1 0.0%
 
43.53 1 0.0%
 
43.77 1 0.0%
 
43.83 1 0.0%
 
44.19 1 0.0%
 

Value_SHILLER_PE_RATIO_YEAR
Numeric

Distinct count 141
Unique (%) 4.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 12.108
Minimum 5.12
Maximum 43.77
Zeros (%) 0.0%

Quantile statistics

Minimum 5.12
5-th percentile 11.9
Q1 11.9
Median 11.9
Q3 11.9
95-th percentile 11.9
Maximum 43.77
Range 38.65
Interquartile range 0

Descriptive statistics

Standard deviation 1.7274
Coef of variation 0.14267
Kurtosis 112.09
Mean 12.108
MAD 0.45349
Skewness 9.3367
Sum 42958
Variance 2.9838
Memory size 27.8 KiB
Value Count Frequency (%)  
11.9 3402 95.9%
 
17.22 2 0.1%
 
9.26 2 0.1%
 
17.09 2 0.1%
 
18.47 2 0.1%
 
20.32 2 0.1%
 
13.9 2 0.1%
 
27.21 1 0.0%
 
22.9 1 0.0%
 
20.97 1 0.0%
 
Other values (131) 131 3.7%
 

Minimum 5 values

Value Count Frequency (%)  
5.12 1 0.0%
 
5.99 1 0.0%
 
6.1 1 0.0%
 
6.29 1 0.0%
 
6.64 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
33.31 1 0.0%
 
33.36 1 0.0%
 
36.98 1 0.0%
 
40.57 1 0.0%
 
43.77 1 0.0%
 

Value_SP500_BVPS_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_QUARTER and should be ignored for analysis

Correlation 0.951

Value_SP500_BVPS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_YEAR and should be ignored for analysis

Correlation 0.9145

Value_SP500_DIV_GROWTH_QUARTER
Numeric

Distinct count 113
Unique (%) 3.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 11.696
Minimum -21.07
Maximum 18.25
Zeros (%) 0.0%

Quantile statistics

Minimum -21.07
5-th percentile 11.89
Q1 11.89
Median 11.89
Q3 11.89
95-th percentile 11.89
Maximum 18.25
Range 39.32
Interquartile range 0

Descriptive statistics

Standard deviation 1.6985
Coef of variation 0.14523
Kurtosis 144.27
Mean 11.696
MAD 0.4133
Skewness -10.499
Sum 41496
Variance 2.8849
Memory size 27.8 KiB
Value Count Frequency (%)  
11.89 3434 96.8%
 
12.65 2 0.1%
 
12.49 2 0.1%
 
10.46 1 0.0%
 
18.25 1 0.0%
 
5.05 1 0.0%
 
1.55 1 0.0%
 
-4.71 1 0.0%
 
1.26 1 0.0%
 
8.02 1 0.0%
 
Other values (103) 103 2.9%
 

Minimum 5 values

Value Count Frequency (%)  
-21.07 1 0.0%
 
-19.63 1 0.0%
 
-17.17 1 0.0%
 
-13.9 1 0.0%
 
-10.86 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
16.74 1 0.0%
 
17.4 1 0.0%
 
17.47 1 0.0%
 
17.51 1 0.0%
 
18.25 1 0.0%
 

Value_SP500_DIV_GROWTH_YEAR
Numeric

Distinct count 32
Unique (%) 0.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -20.824
Minimum -21.07
Maximum 18.25
Zeros (%) 0.0%

Quantile statistics

Minimum -21.07
5-th percentile -21.07
Q1 -21.07
Median -21.07
Q3 -21.07
95-th percentile -21.07
Maximum 18.25
Range 39.32
Interquartile range 0

Descriptive statistics

Standard deviation 2.6645
Coef of variation -0.12795
Kurtosis 125.17
Mean -20.824
MAD 0.48728
Skewness 11.107
Sum -73884
Variance 7.0994
Memory size 27.8 KiB
Value Count Frequency (%)  
-21.07 3517 99.1%
 
6.99 1 0.0%
 
3.07 1 0.0%
 
10.0 1 0.0%
 
1.49 1 0.0%
 
9.33 1 0.0%
 
12.01 1 0.0%
 
11.99 1 0.0%
 
0.97 1 0.0%
 
2.12 1 0.0%
 
Other values (22) 22 0.6%
 

Minimum 5 values

Value Count Frequency (%)  
-21.07 3517 99.1%
 
-3.26 1 0.0%
 
-2.52 1 0.0%
 
0.97 1 0.0%
 
1.45 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
12.72 1 0.0%
 
13.38 1 0.0%
 
14.27 1 0.0%
 
16.26 1 0.0%
 
18.25 1 0.0%
 

Value_SP500_DIV_MONTH
Numeric

Distinct count 1118
Unique (%) 31.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 11.3
Minimum 5.03
Maximum 52.26
Zeros (%) 0.0%

Quantile statistics

Minimum 5.03
5-th percentile 7.11
Q1 7.19
Median 7.19
Q3 12.843
95-th percentile 24.97
Maximum 52.26
Range 47.23
Interquartile range 5.6525

Descriptive statistics

Standard deviation 7.4502
Coef of variation 0.6593
Kurtosis 7.1196
Mean 11.3
MAD 5.3875
Skewness 2.448
Sum 40093
Variance 55.506
Memory size 27.8 KiB
Value Count Frequency (%)  
7.19 1784 50.3%
 
7.29 8 0.2%
 
6.81 7 0.2%
 
8.09 7 0.2%
 
8.4 6 0.2%
 
8.95 6 0.2%
 
7.22 6 0.2%
 
7.11 6 0.2%
 
6.94 5 0.1%
 
7.05 5 0.1%
 
Other values (1108) 1708 48.1%
 

Minimum 5 values

Value Count Frequency (%)  
5.03 1 0.0%
 
5.04 1 0.0%
 
5.1 1 0.0%
 
5.17 2 0.1%
 
5.18 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
50.75 1 0.0%
 
51.0 1 0.0%
 
51.45 1 0.0%
 
51.87 1 0.0%
 
52.26 1 0.0%
 

Value_SP500_DIV_YEAR
Numeric

Distinct count 146
Unique (%) 4.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 7.1088
Minimum 5.04
Maximum 52.26
Zeros (%) 0.0%

Quantile statistics

Minimum 5.04
5-th percentile 6.7
Q1 6.7
Median 6.7
Q3 6.7
95-th percentile 6.7
Maximum 52.26
Range 47.22
Interquartile range 0

Descriptive statistics

Standard deviation 2.8765
Coef of variation 0.40463
Kurtosis 118.18
Mean 7.1088
MAD 0.79031
Skewness 9.9101
Sum 25222
Variance 8.2741
Memory size 27.8 KiB
Value Count Frequency (%)  
6.7 3398 95.8%
 
7.11 2 0.1%
 
6.81 2 0.1%
 
7.61 2 0.1%
 
15.89 2 0.1%
 
12.17 2 0.1%
 
24.21 1 0.0%
 
9.18 1 0.0%
 
8.07 1 0.0%
 
16.97 1 0.0%
 
Other values (136) 136 3.8%
 

Minimum 5 values

Value Count Frequency (%)  
5.04 1 0.0%
 
5.15 1 0.0%
 
5.18 1 0.0%
 
5.2 1 0.0%
 
5.55 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
48.17 1 0.0%
 
50.0 1 0.0%
 
50.03 1 0.0%
 
51.02 1 0.0%
 
52.26 1 0.0%
 

Value_SP500_DIV_YIELD_MONTH
Numeric

Distinct count 607
Unique (%) 17.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.2812
Minimum 1.11
Maximum 13.84
Zeros (%) 0.0%

Quantile statistics

Minimum 1.11
5-th percentile 1.96
Q1 4.22
Median 4.22
Q3 4.28
95-th percentile 6.57
Maximum 13.84
Range 12.73
Interquartile range 0.06

Descriptive statistics

Standard deviation 1.2062
Coef of variation 0.28175
Kurtosis 4.7742
Mean 4.2812
MAD 0.70379
Skewness 0.83882
Sum 15190
Variance 1.455
Memory size 27.8 KiB
Value Count Frequency (%)  
4.22 1781 50.2%
 
5.22 10 0.3%
 
4.55 10 0.3%
 
5.18 10 0.3%
 
4.43 9 0.3%
 
3.53 9 0.3%
 
1.76 9 0.3%
 
4.17 8 0.2%
 
2.93 8 0.2%
 
4.1 8 0.2%
 
Other values (597) 1686 47.5%
 

Minimum 5 values

Value Count Frequency (%)  
1.11 2 0.1%
 
1.13 1 0.0%
 
1.14 1 0.0%
 
1.15 1 0.0%
 
1.16 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
10.15 1 0.0%
 
11.36 1 0.0%
 
12.46 1 0.0%
 
12.64 1 0.0%
 
13.84 1 0.0%
 

Value_SP500_DIV_YIELD_YEAR
Numeric

Distinct count 137
Unique (%) 3.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.0815
Minimum 1.17
Maximum 10.15
Zeros (%) 0.0%

Quantile statistics

Minimum 1.17
5-th percentile 4.07
Q1 4.07
Median 4.07
Q3 4.07
95-th percentile 4.07
Maximum 10.15
Range 8.98
Interquartile range 0

Descriptive statistics

Standard deviation 0.36379
Coef of variation 0.089132
Kurtosis 79.461
Mean 4.0815
MAD 0.069995
Skewness 4.2121
Sum 14481
Variance 0.13234
Memory size 27.8 KiB
Value Count Frequency (%)  
4.07 3400 95.8%
 
6.2 2 0.1%
 
5.41 2 0.1%
 
4.4 2 0.1%
 
3.49 2 0.1%
 
3.53 2 0.1%
 
3.81 2 0.1%
 
1.76 2 0.1%
 
1.61 2 0.1%
 
5.71 2 0.1%
 
Other values (127) 130 3.7%
 

Minimum 5 values

Value Count Frequency (%)  
1.17 1 0.0%
 
1.22 1 0.0%
 
1.36 1 0.0%
 
1.37 1 0.0%
 
1.61 2 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
7.49 1 0.0%
 
8.11 1 0.0%
 
8.38 1 0.0%
 
9.72 1 0.0%
 
10.15 1 0.0%
 

Value_SP500_EARNINGS_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99834

Value_SP500_EARNINGS_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR and should be ignored for analysis

Correlation 0.9994

Value_SP500_EARNINGS_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_DIV_MONTH and should be ignored for analysis

Correlation 0.94595

Value_SP500_EARNINGS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis

Correlation 0.94718

Value_SP500_EARNINGS_YIELD_MONTH
Numeric

Distinct count 792
Unique (%) 22.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 6.4929
Minimum 0.81
Maximum 18.82
Zeros (%) 0.0%

Quantile statistics

Minimum 0.81
5-th percentile 4.45
Q1 5.62
Median 5.62
Q3 6.79
95-th percentile 10.95
Maximum 18.82
Range 18.01
Interquartile range 1.17

Descriptive statistics

Standard deviation 2.1019
Coef of variation 0.32372
Kurtosis 5.3453
Mean 6.4929
MAD 1.4629
Skewness 2.0606
Sum 23037
Variance 4.418
Memory size 27.8 KiB
Value Count Frequency (%)  
5.62 1786 50.3%
 
5.72 10 0.3%
 
5.26 10 0.3%
 
5.3 9 0.3%
 
5.29 9 0.3%
 
5.61 8 0.2%
 
7.24 8 0.2%
 
5.53 8 0.2%
 
5.55 8 0.2%
 
5.54 8 0.2%
 
Other values (782) 1684 47.5%
 

Minimum 5 values

Value Count Frequency (%)  
0.81 2 0.1%
 
0.83 1 0.0%
 
0.91 1 0.0%
 
0.98 1 0.0%
 
1.08 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
17.18 1 0.0%
 
17.21 1 0.0%
 
17.42 1 0.0%
 
18.48 1 0.0%
 
18.82 1 0.0%
 

Value_SP500_EARNINGS_YIELD_YEAR
Numeric

Distinct count 137
Unique (%) 3.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 5.4126
Minimum 1.41
Maximum 17.42
Zeros (%) 0.0%

Quantile statistics

Minimum 1.41
5-th percentile 5.33
Q1 5.33
Median 5.33
Q3 5.33
95-th percentile 5.33
Maximum 17.42
Range 16.01
Interquartile range 0

Descriptive statistics

Standard deviation 0.67946
Coef of variation 0.12553
Kurtosis 105.43
Mean 5.4126
MAD 0.17808
Skewness 9.0727
Sum 19204
Variance 0.46167
Memory size 27.8 KiB
Value Count Frequency (%)  
5.33 3401 95.9%
 
5.53 3 0.1%
 
8.46 3 0.1%
 
5.55 2 0.1%
 
6.57 2 0.1%
 
7.42 2 0.1%
 
4.44 2 0.1%
 
7.35 2 0.1%
 
11.08 2 0.1%
 
9.87 2 0.1%
 
Other values (127) 127 3.6%
 

Minimum 5 values

Value Count Frequency (%)  
1.41 1 0.0%
 
2.17 1 0.0%
 
3.04 1 0.0%
 
3.18 1 0.0%
 
3.44 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
13.53 1 0.0%
 
13.84 1 0.0%
 
15.1 1 0.0%
 
15.77 1 0.0%
 
17.42 1 0.0%
 

Value_SP500_INFLADJ_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_REAL_PRICE_MONTH and should be ignored for analysis

Correlation 0.95138

Value_SP500_INFLADJ_YEAR
Numeric

Distinct count 150
Unique (%) 4.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 101.53
Minimum 81.79
Maximum 2914.2
Zeros (%) 0.0%

Quantile statistics

Minimum 81.79
5-th percentile 81.79
Q1 81.79
Median 81.79
Q3 81.79
95-th percentile 81.79
Maximum 2914.2
Range 2832.4
Interquartile range 0

Descriptive statistics

Standard deviation 157.9
Coef of variation 1.5552
Kurtosis 149.94
Mean 101.53
MAD 37.848
Skewness 11.449
Sum 360230
Variance 24932
Memory size 27.8 KiB
Value Count Frequency (%)  
81.79 3399 95.8%
 
178.12 1 0.0%
 
103.6 1 0.0%
 
739.63 1 0.0%
 
609.48 1 0.0%
 
188.58 1 0.0%
 
178.98 1 0.0%
 
176.02 1 0.0%
 
2041.06 1 0.0%
 
161.59 1 0.0%
 
Other values (140) 140 3.9%
 

Minimum 5 values

Value Count Frequency (%)  
81.79 3399 95.8%
 
88.75 1 0.0%
 
89.81 1 0.0%
 
94.32 1 0.0%
 
94.95 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2187.26 1 0.0%
 
2361.3 1 0.0%
 
2758.77 1 0.0%
 
2836.75 1 0.0%
 
2914.22 1 0.0%
 

Value_SP500_PBV_RATIO_QUARTER
Numeric

Distinct count 61
Unique (%) 1.7%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.5849
Minimum 1.78
Maximum 5.06
Zeros (%) 0.0%

Quantile statistics

Minimum 1.78
5-th percentile 2.58
Q1 2.58
Median 2.58
Q3 2.58
95-th percentile 2.58
Maximum 5.06
Range 3.28
Interquartile range 0

Descriptive statistics

Standard deviation 0.10134
Coef of variation 0.039203
Kurtosis 309.88
Mean 2.5849
MAD 0.015266
Skewness 14.136
Sum 9171.1
Variance 0.010269
Memory size 27.8 KiB
Value Count Frequency (%)  
2.58 3473 97.9%
 
2.91 3 0.1%
 
2.76 3 0.1%
 
2.77 3 0.1%
 
2.43 2 0.1%
 
3.03 2 0.1%
 
2.83 2 0.1%
 
2.67 2 0.1%
 
2.78 2 0.1%
 
2.74 2 0.1%
 
Other values (51) 54 1.5%
 

Minimum 5 values

Value Count Frequency (%)  
1.78 1 0.0%
 
1.85 1 0.0%
 
1.89 1 0.0%
 
1.9 1 0.0%
 
2.0 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4.05 1 0.0%
 
4.49 1 0.0%
 
4.65 1 0.0%
 
5.05 1 0.0%
 
5.06 1 0.0%
 

Value_SP500_PBV_RATIO_YEAR
Numeric

Distinct count 19
Unique (%) 0.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.1744
Minimum 2
Maximum 5.05
Zeros (%) 0.0%

Quantile statistics

Minimum 2
5-th percentile 2.17
Q1 2.17
Median 2.17
Q3 2.17
95-th percentile 2.17
Maximum 5.05
Range 3.05
Interquartile range 0

Descriptive statistics

Standard deviation 0.078208
Coef of variation 0.035968
Kurtosis 682.74
Mean 2.1744
MAD 0.0089232
Skewness 23.544
Sum 7714.7
Variance 0.0061165
Memory size 27.8 KiB
Value Count Frequency (%)  
2.17 3529 99.5%
 
2.76 2 0.1%
 
2.14 1 0.0%
 
2.91 1 0.0%
 
3.5 1 0.0%
 
2.73 1 0.0%
 
2.77 1 0.0%
 
2.81 1 0.0%
 
2.58 1 0.0%
 
3.39 1 0.0%
 
Other values (9) 9 0.3%
 

Minimum 5 values

Value Count Frequency (%)  
2.0 1 0.0%
 
2.05 1 0.0%
 
2.14 1 0.0%
 
2.17 3529 99.5%
 
2.58 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
3.3 1 0.0%
 
3.39 1 0.0%
 
3.5 1 0.0%
 
4.05 1 0.0%
 
5.05 1 0.0%
 

Value_SP500_PE_RATIO_MONTH
Numeric

Distinct count 1116
Unique (%) 31.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.667
Minimum 5.31
Maximum 123.73
Zeros (%) 0.0%

Quantile statistics

Minimum 5.31
5-th percentile 9.13
Q1 14.73
Median 15.61
Q3 15.61
95-th percentile 22.479
Maximum 123.73
Range 118.42
Interquartile range 0.88

Descriptive statistics

Standard deviation 5.9414
Coef of variation 0.37922
Kurtosis 139.05
Mean 15.667
MAD 2.3416
Skewness 9.1666
Sum 55588
Variance 35.3
Memory size 27.8 KiB
Value Count Frequency (%)  
15.61 1781 50.2%
 
11.48 5 0.1%
 
12.21 5 0.1%
 
17.48 5 0.1%
 
9.84 5 0.1%
 
19.0 5 0.1%
 
17.83 5 0.1%
 
7.97 5 0.1%
 
13.82 5 0.1%
 
9.13 4 0.1%
 
Other values (1106) 1723 48.6%
 

Minimum 5 values

Value Count Frequency (%)  
5.31 1 0.0%
 
5.41 1 0.0%
 
5.74 1 0.0%
 
5.81 1 0.0%
 
5.82 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
101.87 1 0.0%
 
110.37 1 0.0%
 
119.85 1 0.0%
 
123.32 1 0.0%
 
123.73 1 0.0%
 

Value_SP500_PE_RATIO_YEAR
Numeric

Distinct count 142
Unique (%) 4.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 11.989
Minimum 5.74
Maximum 70.91
Zeros (%) 0.0%

Quantile statistics

Minimum 5.74
5-th percentile 11.82
Q1 11.82
Median 11.82
Q3 11.82
95-th percentile 11.82
Maximum 70.91
Range 65.17
Interquartile range 0

Descriptive statistics

Standard deviation 1.6921
Coef of variation 0.14114
Kurtosis 487.59
Mean 11.989
MAD 0.37918
Skewness 17.568
Sum 42537
Variance 2.8634
Memory size 27.8 KiB
Value Count Frequency (%)  
11.82 3401 95.9%
 
13.48 2 0.1%
 
18.01 2 0.1%
 
7.97 2 0.1%
 
9.02 2 0.1%
 
10.13 2 0.1%
 
18.77 2 0.1%
 
17.81 1 0.0%
 
19.33 1 0.0%
 
19.99 1 0.0%
 
Other values (132) 132 3.7%
 

Minimum 5 values

Value Count Frequency (%)  
5.74 1 0.0%
 
6.34 1 0.0%
 
6.62 1 0.0%
 
7.22 1 0.0%
 
7.39 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
29.04 1 0.0%
 
31.43 1 0.0%
 
32.92 1 0.0%
 
46.17 1 0.0%
 
70.91 1 0.0%
 

Value_SP500_PSR_QUARTER
Numeric

Distinct count 55
Unique (%) 1.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.432
Minimum 0.8
Maximum 2.31
Zeros (%) 0.0%

Quantile statistics

Minimum 0.8
5-th percentile 1.43
Q1 1.43
Median 1.43
Q3 1.43
95-th percentile 1.43
Maximum 2.31
Range 1.51
Interquartile range 0

Descriptive statistics

Standard deviation 0.048445
Coef of variation 0.03383
Kurtosis 154.51
Mean 1.432
MAD 0.0071731
Skewness 8.0394
Sum 5080.8
Variance 0.002347
Memory size 27.8 KiB
Value Count Frequency (%)  
1.43 3479 98.1%
 
1.44 3 0.1%
 
1.54 3 0.1%
 
1.31 3 0.1%
 
1.19 2 0.1%
 
1.77 2 0.1%
 
1.52 2 0.1%
 
1.27 2 0.1%
 
2.1 2 0.1%
 
1.66 2 0.1%
 
Other values (45) 48 1.4%
 

Minimum 5 values

Value Count Frequency (%)  
0.8 1 0.0%
 
0.87 1 0.0%
 
0.97 1 0.0%
 
1.08 1 0.0%
 
1.1 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2.1 2 0.1%
 
2.13 1 0.0%
 
2.17 1 0.0%
 
2.25 1 0.0%
 
2.31 1 0.0%
 

Value_SP500_PSR_YEAR
Numeric

Distinct count 16
Unique (%) 0.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.3116
Minimum 0.87
Maximum 2.31
Zeros (%) 0.0%

Quantile statistics

Minimum 0.87
5-th percentile 1.31
Q1 1.31
Median 1.31
Q3 1.31
95-th percentile 1.31
Maximum 2.31
Range 1.44
Interquartile range 0

Descriptive statistics

Standard deviation 0.033841
Coef of variation 0.025802
Kurtosis 506.53
Mean 1.3116
MAD 0.003505
Skewness 20.076
Sum 4653.5
Variance 0.0011452
Memory size 27.8 KiB
Value Count Frequency (%)  
1.31 3530 99.5%
 
1.56 2 0.1%
 
1.43 2 0.1%
 
1.77 2 0.1%
 
1.66 1 0.0%
 
0.87 1 0.0%
 
1.81 1 0.0%
 
2.13 1 0.0%
 
1.3 1 0.0%
 
1.95 1 0.0%
 
Other values (6) 6 0.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.87 1 0.0%
 
1.19 1 0.0%
 
1.23 1 0.0%
 
1.3 1 0.0%
 
1.31 3530 99.5%
 

Maximum 5 values

Value Count Frequency (%)  
1.81 1 0.0%
 
1.95 1 0.0%
 
2.13 1 0.0%
 
2.17 1 0.0%
 
2.31 1 0.0%
 

Value_SP500_REAL_EARNINGS_GROWTH_QUARTER
Numeric

Distinct count 114
Unique (%) 3.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 11.791
Minimum -90.27
Maximum 905.56
Zeros (%) 0.0%

Quantile statistics

Minimum -90.27
5-th percentile 11.38
Q1 11.38
Median 11.38
Q3 11.38
95-th percentile 11.38
Maximum 905.56
Range 995.83
Interquartile range 0

Descriptive statistics

Standard deviation 23.282
Coef of variation 1.9746
Kurtosis 1188.3
Mean 11.791
MAD 1.7031
Skewness 32.842
Sum 41833
Variance 542.04
Memory size 27.8 KiB
Value Count Frequency (%)  
11.38 3435 96.8%
 
17.8 1 0.0%
 
-7.72 1 0.0%
 
-31.14 1 0.0%
 
0.55 1 0.0%
 
-44.15 1 0.0%
 
11.18 1 0.0%
 
-21.05 1 0.0%
 
-86.8 1 0.0%
 
15.28 1 0.0%
 
Other values (104) 104 2.9%
 

Minimum 5 values

Value Count Frequency (%)  
-90.27 1 0.0%
 
-86.8 1 0.0%
 
-79.48 1 0.0%
 
-73.86 1 0.0%
 
-51.84 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
72.45 1 0.0%
 
261.66 1 0.0%
 
492.53 1 0.0%
 
870.4 1 0.0%
 
905.56 1 0.0%
 

Value_SP500_REAL_EARNINGS_GROWTH_YEAR
Numeric

Distinct count 33
Unique (%) 0.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -78.62
Minimum -79.48
Maximum 261.66
Zeros (%) 0.0%

Quantile statistics

Minimum -79.48
5-th percentile -79.48
Q1 -79.48
Median -79.48
Q3 -79.48
95-th percentile -79.48
Maximum 261.66
Range 341.14
Interquartile range 0

Descriptive statistics

Standard deviation 10.149
Coef of variation -0.12909
Kurtosis 419.42
Mean -78.62
MAD 1.7038
Skewness 17.108
Sum -278940
Variance 103.01
Memory size 27.8 KiB
Value Count Frequency (%)  
-79.48 3516 99.1%
 
17.8 1 0.0%
 
8.1 1 0.0%
 
261.66 1 0.0%
 
-28.02 1 0.0%
 
49.72 1 0.0%
 
0.36 1 0.0%
 
-6.49 1 0.0%
 
11.38 1 0.0%
 
10.69 1 0.0%
 
Other values (23) 23 0.6%
 

Minimum 5 values

Value Count Frequency (%)  
-79.48 3516 99.1%
 
-51.84 1 0.0%
 
-28.02 1 0.0%
 
-21.05 1 0.0%
 
-15.56 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
24.85 1 0.0%
 
36.07 1 0.0%
 
49.72 1 0.0%
 
72.45 1 0.0%
 
261.66 1 0.0%
 

Value_SP500_REAL_PRICE_MONTH
Numeric

Distinct count 1400
Unique (%) 39.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 133.25
Minimum 2.73
Maximum 2789.8
Zeros (%) 0.0%

Quantile statistics

Minimum 2.73
5-th percentile 4.37
Q1 4.37
Median 4.37
Q3 16.293
95-th percentile 1149.6
Maximum 2789.8
Range 2787.1
Interquartile range 11.922

Descriptive statistics

Standard deviation 391.81
Coef of variation 2.9404
Kurtosis 14.745
Mean 133.25
MAD 208.57
Skewness 3.7646
Sum 472780
Variance 153520
Memory size 27.8 KiB
Value Count Frequency (%)  
4.37 1784 50.3%
 
4.46 7 0.2%
 
5.32 6 0.2%
 
5.3 6 0.2%
 
5.18 6 0.2%
 
7.68 6 0.2%
 
4.59 5 0.1%
 
5.51 5 0.1%
 
4.65 5 0.1%
 
8.12 5 0.1%
 
Other values (1390) 1713 48.3%
 

Minimum 5 values

Value Count Frequency (%)  
2.73 1 0.0%
 
2.85 1 0.0%
 
2.94 2 0.1%
 
3.05 1 0.0%
 
3.17 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2702.77 1 0.0%
 
2705.16 1 0.0%
 
2736.61 1 0.0%
 
2754.35 1 0.0%
 
2789.8 1 0.0%
 

Value_SP500_REAL_SALES_GROWTH_QUARTER
Numeric

Distinct count 66
Unique (%) 1.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -1.7438
Minimum -15.81
Maximum 8.89
Zeros (%) 0.0%

Quantile statistics

Minimum -15.81
5-th percentile -1.8
Q1 -1.8
Median -1.8
Q3 -1.8
95-th percentile -1.8
Maximum 8.89
Range 24.7
Interquartile range 0

Descriptive statistics

Standard deviation 0.86472
Coef of variation -0.49588
Kurtosis 96.112
Mean -1.7438
MAD 0.15904
Skewness 3.3383
Sum -6187
Variance 0.74774
Memory size 27.8 KiB
Value Count Frequency (%)  
-1.8 3483 98.2%
 
7.95 1 0.0%
 
7.72 1 0.0%
 
-6.85 1 0.0%
 
5.72 1 0.0%
 
7.23 1 0.0%
 
1.13 1 0.0%
 
2.88 1 0.0%
 
4.39 1 0.0%
 
5.09 1 0.0%
 
Other values (56) 56 1.6%
 

Minimum 5 values

Value Count Frequency (%)  
-15.81 1 0.0%
 
-12.69 1 0.0%
 
-12.66 1 0.0%
 
-9.91 1 0.0%
 
-8.35 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
7.53 1 0.0%
 
7.72 1 0.0%
 
7.95 1 0.0%
 
8.87 1 0.0%
 
8.89 1 0.0%
 

Value_SP500_REAL_SALES_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_GROWTH_YEAR and should be ignored for analysis

Correlation 0.99905

Value_SP500_REAL_SALES_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis

Correlation 0.98094

Value_SP500_REAL_SALES_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_GROWTH_YEAR and should be ignored for analysis

Correlation 0.91447

Value_SP500_SALES_GROWTH_QUARTER
Numeric

Distinct count 67
Unique (%) 1.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -16.087
Minimum -16.46
Maximum 12.56
Zeros (%) 0.0%

Quantile statistics

Minimum -16.46
5-th percentile -16.46
Q1 -16.46
Median -16.46
Q3 -16.46
95-th percentile -16.46
Maximum 12.56
Range 29.02
Interquartile range 0

Descriptive statistics

Standard deviation 2.8312
Coef of variation -0.176
Kurtosis 61.813
Mean -16.087
MAD 0.73302
Skewness 7.8448
Sum -57075
Variance 8.0156
Memory size 27.8 KiB
Value Count Frequency (%)  
-16.46 3482 98.1%
 
-2.47 1 0.0%
 
1.7 1 0.0%
 
0.2 1 0.0%
 
-8.45 1 0.0%
 
8.94 1 0.0%
 
7.62 1 0.0%
 
9.96 1 0.0%
 
8.04 1 0.0%
 
4.81 1 0.0%
 
Other values (57) 57 1.6%
 

Minimum 5 values

Value Count Frequency (%)  
-16.46 3482 98.1%
 
-12.86 1 0.0%
 
-11.95 1 0.0%
 
-8.45 1 0.0%
 
-7.97 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
10.93 1 0.0%
 
11.01 1 0.0%
 
11.29 1 0.0%
 
11.99 1 0.0%
 
12.56 1 0.0%
 

Value_SP500_SALES_GROWTH_YEAR
Numeric

Distinct count 21
Unique (%) 0.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -12.761
Minimum -12.86
Maximum 10.93
Zeros (%) 0.0%

Quantile statistics

Minimum -12.86
5-th percentile -12.86
Q1 -12.86
Median -12.86
Q3 -12.86
95-th percentile -12.86
Maximum 10.93
Range 23.79
Interquartile range 0

Descriptive statistics

Standard deviation 1.3605
Coef of variation -0.10661
Kurtosis 208.16
Mean -12.761
MAD 0.19651
Skewness 14.282
Sum -45277
Variance 1.8508
Memory size 27.8 KiB
Value Count Frequency (%)  
-12.86 3528 99.4%
 
2.24 1 0.0%
 
5.98 1 0.0%
 
5.37 1 0.0%
 
10.93 1 0.0%
 
4.16 1 0.0%
 
-1.18 1 0.0%
 
7.03 1 0.0%
 
1.7 1 0.0%
 
3.76 1 0.0%
 
Other values (11) 11 0.3%
 

Minimum 5 values

Value Count Frequency (%)  
-12.86 3528 99.4%
 
-8.45 1 0.0%
 
-3.11 1 0.0%
 
-1.18 1 0.0%
 
1.7 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
8.94 1 0.0%
 
9.03 1 0.0%
 
9.36 1 0.0%
 
10.88 1 0.0%
 
10.93 1 0.0%
 

Value_SP500_SALES_QUARTER
Numeric

Distinct count 71
Unique (%) 2.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 680.53
Minimum 674.59
Maximum 1292.8
Zeros (%) 0.0%

Quantile statistics

Minimum 674.59
5-th percentile 674.59
Q1 674.59
Median 674.59
Q3 674.59
95-th percentile 674.59
Maximum 1292.8
Range 618.25
Interquartile range 0

Descriptive statistics

Standard deviation 48.068
Coef of variation 0.070634
Kurtosis 79.358
Mean 680.53
MAD 11.646
Skewness 8.7691
Sum 2414500
Variance 2310.6
Memory size 27.8 KiB
Value Count Frequency (%)  
674.59 3478 98.0%
 
1127.13 1 0.0%
 
917.93 1 0.0%
 
998.54 1 0.0%
 
1185.81 1 0.0%
 
828.1 1 0.0%
 
1136.16 1 0.0%
 
738.81 1 0.0%
 
965.19 1 0.0%
 
981.21 1 0.0%
 
Other values (61) 61 1.7%
 

Minimum 5 values

Value Count Frequency (%)  
674.59 3478 98.0%
 
678.6 1 0.0%
 
684.42 1 0.0%
 
697.75 1 0.0%
 
697.9 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
1185.81 1 0.0%
 
1203.1 1 0.0%
 
1231.57 1 0.0%
 
1259.18 1 0.0%
 
1292.84 1 0.0%
 

Value_SP500_SALES_YEAR
Numeric

Distinct count 22
Unique (%) 0.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 676.69
Minimum 674.59
Maximum 1292.8
Zeros (%) 0.0%

Quantile statistics

Minimum 674.59
5-th percentile 674.59
Q1 674.59
Median 674.59
Q3 674.59
95-th percentile 674.59
Maximum 1292.8
Range 618.25
Interquartile range 0

Descriptive statistics

Standard deviation 30.328
Coef of variation 0.044818
Kurtosis 255.59
Mean 676.69
MAD 4.169
Skewness 15.665
Sum 2400900
Variance 919.76
Memory size 27.8 KiB
Value Count Frequency (%)  
674.59 3527 99.4%
 
1127.13 1 0.0%
 
1092.37 1 0.0%
 
1203.1 1 0.0%
 
1292.84 1 0.0%
 
1169.42 1 0.0%
 
1163.32 1 0.0%
 
1042.46 1 0.0%
 
874.32 1 0.0%
 
710.81 1 0.0%
 
Other values (12) 12 0.3%
 

Minimum 5 values

Value Count Frequency (%)  
674.59 3527 99.4%
 
710.81 1 0.0%
 
736.88 1 0.0%
 
745.7 1 0.0%
 
788.17 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
1169.42 1 0.0%
 
1203.1 1 0.0%
 
1231.57 1 0.0%
 
1259.18 1 0.0%
 
1292.84 1 0.0%
 

Correlations

Sample

Value_SP500_REAL_PRICE_MONTH Value_SP500_DIV_YIELD_MONTH Value_SP500_PE_RATIO_MONTH Value_SHILLER_PE_RATIO_MONTH Value_SP500_EARNINGS_YIELD_MONTH Value_SP500_INFLADJ_MONTH Value_SP500_PSR_QUARTER Value_SP500_DIV_MONTH Value_SP500_DIV_YEAR Value_SP500_DIV_GROWTH_YEAR Value_SP500_DIV_GROWTH_QUARTER Value_SP500_PBV_RATIO_QUARTER Value_SHILLER_PE_RATIO_YEAR Value_SP500_PE_RATIO_YEAR Value_SP500_DIV_YIELD_YEAR Value_SP500_PSR_YEAR Value_SP500_EARNINGS_YIELD_YEAR Value_SP500_PBV_RATIO_YEAR Value_SP500_INFLADJ_YEAR Value_SP500_SALES_YEAR Value_SP500_SALES_GROWTH_YEAR Value_SP500_SALES_QUARTER Value_SP500_REAL_SALES_GROWTH_QUARTER Value_SP500_SALES_GROWTH_QUARTER Value_SP500_REAL_SALES_GROWTH_YEAR Value_SP500_REAL_EARNINGS_GROWTH_YEAR Value_SP500_REAL_SALES_YEAR Value_SP500_REAL_EARNINGS_GROWTH_QUARTER Value_SP500_EARNINGS_GROWTH_QUARTER Value_SP500_REAL_SALES_QUARTER Value_SP500_EARNINGS_MONTH Value_SP500_BVPS_YEAR Value_SP500_EARNINGS_YEAR Value_SP500_EARNINGS_GROWTH_YEAR Value_SP500_BVPS_QUARTER
Date
1871-01-01 4.44 4.22 11.10 11.34 9.01 89.81 1.43 7.19 6.70 -21.07 11.89 2.58 11.9 11.10 4.07 1.31 9.01 2.17 89.81 674.59 -12.86 674.59 -1.8 -16.46 -12.69 -79.48 943.84 11.38 17.75 942.41 8.83 290.68 7.95 -77.52 290.68
1871-01-31 4.37 5.86 15.61 11.34 5.62 99.41 1.43 5.26 5.15 -21.07 11.89 2.58 11.9 11.82 5.86 1.31 5.33 2.17 81.79 674.59 -12.86 674.59 -1.8 -16.46 -12.69 -79.48 943.84 11.38 17.75 942.41 8.09 290.68 7.92 -77.52 290.68
1871-02-01 4.50 4.22 11.25 10.92 8.89 88.33 1.43 7.19 6.70 -21.07 11.89 2.58 11.9 11.82 4.07 1.31 5.33 2.17 81.79 674.59 -12.86 674.59 -1.8 -16.46 -12.69 -79.48 943.84 11.38 17.75 942.41 8.83 290.68 7.95 -77.52 290.68
1871-02-28 4.37 5.78 15.61 11.34 5.62 99.41 1.43 5.10 6.70 -21.07 11.89 2.58 11.9 11.82 4.07 1.31 5.33 2.17 81.79 674.59 -12.86 674.59 -1.8 -16.46 -12.69 -79.48 943.84 11.38 17.75 942.41 7.85 290.68 7.95 -77.52 290.68
1871-03-01 4.61 4.22 11.52 11.19 8.68 89.17 1.43 7.19 6.70 -21.07 11.89 2.58 11.9 11.82 4.07 1.31 5.33 2.17 81.79 674.59 -12.86 674.59 -1.8 -16.46 -12.69 -79.48 943.84 11.38 17.75 942.41 8.83 290.68 7.95 -77.52 290.68
In [29]:
#Apply linear interpolation on the dataset.
df_interpolate = df.interpolate(method='linear',axis=0,inplace=False,limit_direction='both')
In [30]:
print(df_imputed.shape)
print(df_interpolate.shape)
(3548, 35)
(3548, 35)
In [31]:
#Full profile on interpolated dataset.
pandas_profiling.ProfileReport(df_interpolate)
Out[31]:

Overview

Dataset info

Number of variables 36
Number of observations 3548
Total Missing (%) 0.0%
Total size in memory 998.0 KiB
Average record size in memory 288.0 B

Variables types

Numeric 13
Categorical 0
Boolean 0
Date 1
Text (Unique) 0
Rejected 22
Unsupported 0

Warnings

Variables

Date
Date

Distinct count 3548
Unique (%) 100.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Minimum 1871-01-01 00:00:00
Maximum 2018-12-31 00:00:00

Value_SHILLER_PE_RATIO_MONTH
Numeric

Distinct count 2354
Unique (%) 66.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 16.578
Minimum 4.78
Maximum 44.19
Zeros (%) 0.0%

Quantile statistics

Minimum 4.78
5-th percentile 7.9768
Q1 11.62
Median 15.655
Q3 20.152
95-th percentile 28.102
Maximum 44.19
Range 39.41
Interquartile range 8.5325

Descriptive statistics

Standard deviation 6.676
Coef of variation 0.4027
Kurtosis 1.9619
Mean 16.578
MAD 5.1107
Skewness 1.1209
Sum 58820
Variance 44.57
Memory size 27.8 KiB
Value Count Frequency (%)  
13.8 7 0.2%
 
11.34 7 0.2%
 
17.65 7 0.2%
 
15.23 7 0.2%
 
17.82 7 0.2%
 
12.43 6 0.2%
 
10.91 6 0.2%
 
11.64 6 0.2%
 
15.27 6 0.2%
 
18.07 6 0.2%
 
Other values (2344) 3483 98.2%
 

Minimum 5 values

Value Count Frequency (%)  
4.78 1 0.0%
 
4.95 1 0.0%
 
4.955 1 0.0%
 
5.02 1 0.0%
 
5.04 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
43.7 1 0.0%
 
43.77 1 0.0%
 
43.83 1 0.0%
 
43.980000000000004 1 0.0%
 
44.19 1 0.0%
 

Value_SHILLER_PE_RATIO_YEAR
Highly correlated

This variable is highly correlated with Value_SHILLER_PE_RATIO_MONTH and should be ignored for analysis

Correlation 0.98989

Value_SP500_BVPS_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_BVPS_YEAR and should be ignored for analysis

Correlation 0.99959

Value_SP500_BVPS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis

Correlation 0.97772

Value_SP500_DIV_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_DIV_GROWTH_YEAR and should be ignored for analysis

Correlation 0.99279

Value_SP500_DIV_GROWTH_YEAR
Numeric

Distinct count 689
Unique (%) 19.4%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 11.924
Minimum -21.07
Maximum 18.25
Zeros (%) 0.0%

Quantile statistics

Minimum -21.07
5-th percentile 2.3661
Q1 13.38
Median 13.38
Q3 13.38
95-th percentile 13.38
Maximum 18.25
Range 39.32
Interquartile range 0

Descriptive statistics

Standard deviation 4.2003
Coef of variation 0.35224
Kurtosis 15.155
Mean 11.924
MAD 2.5096
Skewness -3.5077
Sum 42308
Variance 17.643
Memory size 27.8 KiB
Value Count Frequency (%)  
13.38 2856 80.5%
 
8.65 4 0.1%
 
10.68 2 0.1%
 
13.4225 1 0.0%
 
4.31625 1 0.0%
 
16.26 1 0.0%
 
11.246666666666666 1 0.0%
 
13.151666666666667 1 0.0%
 
17.255000000000003 1 0.0%
 
12.135 1 0.0%
 
Other values (679) 679 19.1%
 

Minimum 5 values

Value Count Frequency (%)  
-21.07 1 0.0%
 
-20.131666666666668 1 0.0%
 
-20.09375 1 0.0%
 
-19.193333333333335 1 0.0%
 
-19.1175 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
17.989166666666666 1 0.0%
 
18.00125 1 0.0%
 
18.08416666666667 1 0.0%
 
18.167083333333334 1 0.0%
 
18.25 1 0.0%
 

Value_SP500_DIV_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_INFLADJ_MONTH and should be ignored for analysis

Correlation 0.91991

Value_SP500_DIV_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_MONTH and should be ignored for analysis

Correlation 0.99956

Value_SP500_DIV_YIELD_MONTH
Numeric

Distinct count 1295
Unique (%) 36.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.3469
Minimum 1.11
Maximum 13.84
Zeros (%) 0.0%

Quantile statistics

Minimum 1.11
5-th percentile 1.74
Q1 3.1588
Median 4.285
Q3 5.39
95-th percentile 7.18
Maximum 13.84
Range 12.73
Interquartile range 2.2312

Descriptive statistics

Standard deviation 1.6983
Coef of variation 0.39069
Kurtosis 0.77793
Mean 4.3469
MAD 1.3423
Skewness 0.47674
Sum 15423
Variance 2.8842
Memory size 27.8 KiB
Value Count Frequency (%)  
5.18 15 0.4%
 
5.01 13 0.4%
 
1.94 12 0.3%
 
1.76 12 0.3%
 
5.22 12 0.3%
 
3.7 11 0.3%
 
4.22 11 0.3%
 
4.92 10 0.3%
 
2.96 10 0.3%
 
3.49 10 0.3%
 
Other values (1285) 3432 96.7%
 

Minimum 5 values

Value Count Frequency (%)  
1.11 3 0.1%
 
1.12 1 0.0%
 
1.13 1 0.0%
 
1.1349999999999998 1 0.0%
 
1.14 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
12.46 1 0.0%
 
12.64 1 0.0%
 
13.15 1 0.0%
 
13.24 1 0.0%
 
13.84 1 0.0%
 

Value_SP500_DIV_YIELD_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YIELD_MONTH and should be ignored for analysis

Correlation 0.97677

Value_SP500_EARNINGS_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99925

Value_SP500_EARNINGS_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR and should be ignored for analysis

Correlation 0.99856

Value_SP500_EARNINGS_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_INFLADJ_YEAR and should be ignored for analysis

Correlation 0.9169

Value_SP500_EARNINGS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_EARNINGS_MONTH and should be ignored for analysis

Correlation 0.99519

Value_SP500_EARNINGS_YIELD_MONTH
Numeric

Distinct count 1649
Unique (%) 46.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 7.3645
Minimum 0.81
Maximum 18.82
Zeros (%) 0.0%

Quantile statistics

Minimum 0.81
5-th percentile 3.8938
Q1 5.55
Median 6.78
Q3 8.755
95-th percentile 12.783
Maximum 18.82
Range 18.01
Interquartile range 3.205

Descriptive statistics

Standard deviation 2.7021
Coef of variation 0.36691
Kurtosis 1.1874
Mean 7.3645
MAD 2.0858
Skewness 0.96555
Sum 26129
Variance 7.3012
Memory size 27.8 KiB
Value Count Frequency (%)  
5.62 15 0.4%
 
5.29 13 0.4%
 
5.54 13 0.4%
 
5.33 12 0.3%
 
5.72 11 0.3%
 
6.23 11 0.3%
 
5.3 11 0.3%
 
5.38 11 0.3%
 
5.48 11 0.3%
 
5.26 11 0.3%
 
Other values (1639) 3429 96.6%
 

Minimum 5 values

Value Count Frequency (%)  
0.81 3 0.1%
 
0.8200000000000001 1 0.0%
 
0.83 1 0.0%
 
0.87 1 0.0%
 
0.895 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
17.845 1 0.0%
 
18.12 1 0.0%
 
18.48 1 0.0%
 
18.65 1 0.0%
 
18.82 1 0.0%
 

Value_SP500_EARNINGS_YIELD_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_EARNINGS_YIELD_MONTH and should be ignored for analysis

Correlation 0.98185

Value_SP500_INFLADJ_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_REAL_PRICE_MONTH and should be ignored for analysis

Correlation 0.9681

Value_SP500_INFLADJ_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis

Correlation 0.92035

Value_SP500_PBV_RATIO_QUARTER
Numeric

Distinct count 333
Unique (%) 9.4%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.7592
Minimum 1.78
Maximum 5.06
Zeros (%) 0.0%

Quantile statistics

Minimum 1.78
5-th percentile 2.6767
Q1 5.05
Median 5.05
Q3 5.05
95-th percentile 5.05
Maximum 5.06
Range 3.28
Interquartile range 0

Descriptive statistics

Standard deviation 0.79199
Coef of variation 0.16641
Kurtosis 4.4417
Mean 4.7592
MAD 0.50877
Skewness -2.475
Sum 16886
Variance 0.62724
Memory size 27.8 KiB
Value Count Frequency (%)  
5.05 3096 87.3%
 
2.76 9 0.3%
 
2.91 8 0.2%
 
2.74 7 0.2%
 
2.73 5 0.1%
 
2.19 4 0.1%
 
2.63 4 0.1%
 
2.58 4 0.1%
 
2.81 4 0.1%
 
2.763333333333333 4 0.1%
 
Other values (323) 403 11.4%
 

Minimum 5 values

Value Count Frequency (%)  
1.78 1 0.0%
 
1.7983333333333333 1 0.0%
 
1.8166666666666667 2 0.1%
 
1.835 1 0.0%
 
1.85 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
5.053333333333333 1 0.0%
 
5.055 1 0.0%
 
5.056666666666667 1 0.0%
 
5.058333333333333 1 0.0%
 
5.06 1 0.0%
 

Value_SP500_PBV_RATIO_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PBV_RATIO_QUARTER and should be ignored for analysis

Correlation 0.99857

Value_SP500_PE_RATIO_MONTH
Numeric

Distinct count 2218
Unique (%) 62.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.731
Minimum 5.31
Maximum 123.73
Zeros (%) 0.0%

Quantile statistics

Minimum 5.31
5-th percentile 7.8202
Q1 11.429
Median 14.75
Q3 18.03
95-th percentile 25.691
Maximum 123.73
Range 118.42
Interquartile range 6.6013

Descriptive statistics

Standard deviation 8.3823
Coef of variation 0.53285
Kurtosis 67.083
Mean 15.731
MAD 4.6349
Skewness 6.4167
Sum 55814
Variance 70.263
Memory size 27.8 KiB
Value Count Frequency (%)  
15.61 8 0.2%
 
12.21 7 0.2%
 
9.84 7 0.2%
 
10.34 7 0.2%
 
14.51 6 0.2%
 
11.48 6 0.2%
 
14.75 6 0.2%
 
10.98 6 0.2%
 
19.0 6 0.2%
 
15.11 5 0.1%
 
Other values (2208) 3484 98.2%
 

Minimum 5 values

Value Count Frequency (%)  
5.31 1 0.0%
 
5.359999999999999 1 0.0%
 
5.41 1 0.0%
 
5.525 1 0.0%
 
5.609999999999999 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
119.85 1 0.0%
 
121.78999999999999 1 0.0%
 
123.32 1 0.0%
 
123.525 1 0.0%
 
123.73 1 0.0%
 

Value_SP500_PE_RATIO_YEAR
Numeric

Distinct count 3346
Unique (%) 94.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.741
Minimum 5.74
Maximum 70.91
Zeros (%) 0.0%

Quantile statistics

Minimum 5.74
5-th percentile 8.0235
Q1 11.653
Median 15.053
Q3 18.011
95-th percentile 25.639
Maximum 70.91
Range 65.17
Interquartile range 6.3581

Descriptive statistics

Standard deviation 6.6443
Coef of variation 0.42211
Kurtosis 14.325
Mean 15.741
MAD 4.3673
Skewness 2.7959
Sum 55848
Variance 44.147
Memory size 27.8 KiB
Value Count Frequency (%)  
18.08 3 0.1%
 
17.496666666666666 3 0.1%
 
15.59 3 0.1%
 
11.82 3 0.1%
 
14.835 3 0.1%
 
17.45 3 0.1%
 
16.85 3 0.1%
 
17.21666666666667 3 0.1%
 
16.3 3 0.1%
 
12.95125 3 0.1%
 
Other values (3336) 3518 99.2%
 

Minimum 5 values

Value Count Frequency (%)  
5.74 1 0.0%
 
5.765000000000001 1 0.0%
 
5.79 1 0.0%
 
5.815 1 0.0%
 
5.832916666666667 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
66.72583333333333 1 0.0%
 
66.78916666666666 1 0.0%
 
68.81791666666666 1 0.0%
 
68.84958333333333 1 0.0%
 
70.91 1 0.0%
 

Value_SP500_PSR_QUARTER
Numeric

Distinct count 300
Unique (%) 8.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.738
Minimum 0.8
Maximum 2.31
Zeros (%) 0.0%

Quantile statistics

Minimum 0.8
5-th percentile 1.4339
Q1 1.77
Median 1.77
Q3 1.77
95-th percentile 1.77
Maximum 2.31
Range 1.51
Interquartile range 0

Descriptive statistics

Standard deviation 0.13545
Coef of variation 0.077933
Kurtosis 13.779
Mean 1.738
MAD 0.067544
Skewness -3.163
Sum 6166.4
Variance 0.018346
Memory size 27.8 KiB
Value Count Frequency (%)  
1.77 3122 88.0%
 
1.44 10 0.3%
 
1.66 7 0.2%
 
1.52 7 0.2%
 
2.1 7 0.2%
 
1.43 6 0.2%
 
1.46 6 0.2%
 
1.3166666666666667 5 0.1%
 
1.47 4 0.1%
 
1.33 4 0.1%
 
Other values (290) 370 10.4%
 

Minimum 5 values

Value Count Frequency (%)  
0.8 1 0.0%
 
0.8116666666666668 1 0.0%
 
0.8233333333333334 1 0.0%
 
0.8283333333333334 1 0.0%
 
0.835 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2.19 1 0.0%
 
2.205 1 0.0%
 
2.25 1 0.0%
 
2.2575000000000003 1 0.0%
 
2.31 1 0.0%
 

Value_SP500_PSR_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PSR_QUARTER and should be ignored for analysis

Correlation 0.98853

Value_SP500_REAL_EARNINGS_GROWTH_QUARTER
Numeric

Distinct count 675
Unique (%) 19.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -1.6836
Minimum -90.27
Maximum 905.56
Zeros (%) 0.0%

Quantile statistics

Minimum -90.27
5-th percentile -7.94
Q1 -7.94
Median -7.94
Q3 -7.94
95-th percentile 16.01
Maximum 905.56
Range 995.83
Interquartile range 0

Descriptive statistics

Standard deviation 56.051
Coef of variation -33.292
Kurtosis 184.12
Mean -1.6836
MAD 13.005
Skewness 12.982
Sum -5973.4
Variance 3141.7
Memory size 27.8 KiB
Value Count Frequency (%)  
-7.94 2856 80.5%
 
14.93 9 0.3%
 
11.38 7 0.2%
 
-0.54 2 0.1%
 
-12.535 2 0.1%
 
-13.59 2 0.1%
 
14.05 2 0.1%
 
13.870000000000001 1 0.0%
 
-14.786666666666667 1 0.0%
 
-17.93999999999999 1 0.0%
 
Other values (665) 665 18.7%
 

Minimum 5 values

Value Count Frequency (%)  
-90.27 1 0.0%
 
-89.69166666666666 1 0.0%
 
-89.11333333333333 1 0.0%
 
-88.535 1 0.0%
 
-88.47166666666666 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
882.12 1 0.0%
 
887.98 1 0.0%
 
893.8399999999999 1 0.0%
 
899.6999999999999 1 0.0%
 
905.56 1 0.0%
 

Value_SP500_REAL_EARNINGS_GROWTH_YEAR
Numeric

Distinct count 685
Unique (%) 19.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -3.8153
Minimum -79.48
Maximum 261.66
Zeros (%) 0.0%

Quantile statistics

Minimum -79.48
5-th percentile -7.94
Q1 -7.94
Median -7.94
Q3 -7.94
95-th percentile 16.192
Maximum 261.66
Range 341.14
Interquartile range 0

Descriptive statistics

Standard deviation 20.964
Coef of variation -5.4947
Kurtosis 69.189
Mean -3.8153
MAD 8.4864
Skewness 7.1384
Sum -13537
Variance 439.48
Memory size 27.8 KiB
Value Count Frequency (%)  
-7.94 2856 80.5%
 
14.93 9 0.3%
 
-15.020833333333332 1 0.0%
 
4.810000000000002 1 0.0%
 
12.56375 1 0.0%
 
-28.35375 1 0.0%
 
22.085 1 0.0%
 
34.3975 1 0.0%
 
-0.12124999999999997 1 0.0%
 
7.221666666666667 1 0.0%
 
Other values (675) 675 19.0%
 

Minimum 5 values

Value Count Frequency (%)  
-79.48 1 0.0%
 
-77.04541666666668 1 0.0%
 
-74.61083333333335 1 0.0%
 
-72.17625000000001 1 0.0%
 
-69.74166666666667 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
235.16750000000002 1 0.0%
 
243.99833333333336 1 0.0%
 
247.44583333333338 1 0.0%
 
252.82916666666668 1 0.0%
 
261.66 1 0.0%
 

Value_SP500_REAL_PRICE_MONTH
Numeric

Distinct count 2827
Unique (%) 79.7%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 267.14
Minimum 2.73
Maximum 2789.8
Zeros (%) 0.0%

Quantile statistics

Minimum 2.73
5-th percentile 4.4
Q1 7.8138
Median 16.555
Q3 128.36
95-th percentile 1428.9
Maximum 2789.8
Range 2787.1
Interquartile range 120.55

Descriptive statistics

Standard deviation 533.7
Coef of variation 1.9978
Kurtosis 5.5084
Mean 267.14
MAD 366.21
Skewness 2.4362
Sum 947810
Variance 284830
Memory size 27.8 KiB
Value Count Frequency (%)  
7.68 11 0.3%
 
5.18 9 0.3%
 
4.37 9 0.3%
 
4.46 9 0.3%
 
2736.61 8 0.2%
 
5.25 8 0.2%
 
9.3 7 0.2%
 
4.54 7 0.2%
 
8.83 6 0.2%
 
5.33 6 0.2%
 
Other values (2817) 3468 97.7%
 

Minimum 5 values

Value Count Frequency (%)  
2.73 1 0.0%
 
2.79 1 0.0%
 
2.835 1 0.0%
 
2.85 1 0.0%
 
2.94 3 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
2736.61 8 0.2%
 
2745.48 1 0.0%
 
2747.48 1 0.0%
 
2754.35 1 0.0%
 
2789.8 1 0.0%
 

Value_SP500_REAL_SALES_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_SALES_GROWTH_YEAR and should be ignored for analysis

Correlation 0.9797

Value_SP500_REAL_SALES_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.96667

Value_SP500_REAL_SALES_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_YEAR and should be ignored for analysis

Correlation 0.99537

Value_SP500_REAL_SALES_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis

Correlation 0.90087

Value_SP500_SALES_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99017

Value_SP500_SALES_GROWTH_YEAR
Numeric

Distinct count 394
Unique (%) 11.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -0.64806
Minimum -12.86
Maximum 10.93
Zeros (%) 0.0%

Quantile statistics

Minimum -12.86
5-th percentile -1.18
Q1 -1.18
Median -1.18
Q3 -1.18
95-th percentile 5.1897
Maximum 10.93
Range 23.79
Interquartile range 0

Descriptive statistics

Standard deviation 2.4013
Coef of variation -3.7053
Kurtosis 11.202
Mean -0.64806
MAD 1.1652
Skewness 2.698
Sum -2299.3
Variance 5.766
Memory size 27.8 KiB
Value Count Frequency (%)  
-1.18 3144 88.6%
 
9.03 9 0.3%
 
3.92 2 0.1%
 
2.62 2 0.1%
 
3.68 2 0.1%
 
8.94 1 0.0%
 
6.684166666666667 1 0.0%
 
10.266666666666666 1 0.0%
 
1.44 1 0.0%
 
-2.6916666666666664 1 0.0%
 
Other values (384) 384 10.8%
 

Minimum 5 values

Value Count Frequency (%)  
-12.86 1 0.0%
 
-12.253333333333332 1 0.0%
 
-12.075 1 0.0%
 
-11.646666666666665 1 0.0%
 
-11.29 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
10.921666666666667 1 0.0%
 
10.92375 1 0.0%
 
10.925833333333333 1 0.0%
 
10.927916666666667 1 0.0%
 
10.93 1 0.0%
 

Value_SP500_SALES_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_SALES_YEAR and should be ignored for analysis

Correlation 0.99943

Value_SP500_SALES_YEAR
Numeric

Distinct count 421
Unique (%) 11.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 773.69
Minimum 674.59
Maximum 1292.8
Zeros (%) 0.0%

Quantile statistics

Minimum 674.59
5-th percentile 745.7
Q1 745.7
Median 745.7
Q3 745.7
95-th percentile 1036.8
Maximum 1292.8
Range 618.25
Interquartile range 0

Descriptive statistics

Standard deviation 96.243
Coef of variation 0.12439
Kurtosis 10.629
Mean 773.69
MAD 52.026
Skewness 3.3842
Sum 2745100
Variance 9262.8
Memory size 27.8 KiB
Value Count Frequency (%)  
745.7 3120 87.9%
 
1292.84 9 0.3%
 
1172.2266666666667 1 0.0%
 
791.7595833333332 1 0.0%
 
1140.8675 1 0.0%
 
870.7304166666668 1 0.0%
 
1240.7733333333333 1 0.0%
 
1236.1716666666666 1 0.0%
 
966.465 1 0.0%
 
775.2766666666666 1 0.0%
 
Other values (411) 411 11.6%
 

Minimum 5 values

Value Count Frequency (%)  
674.59 1 0.0%
 
676.0991666666667 1 0.0%
 
677.1854166666667 1 0.0%
 
677.6083333333333 1 0.0%
 
679.1175000000001 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
1270.4 1 0.0%
 
1276.01 1 0.0%
 
1281.62 1 0.0%
 
1287.23 1 0.0%
 
1292.84 9 0.3%
 

Correlations

Sample

Value_SP500_REAL_PRICE_MONTH Value_SP500_DIV_YIELD_MONTH Value_SP500_PE_RATIO_MONTH Value_SHILLER_PE_RATIO_MONTH Value_SP500_EARNINGS_YIELD_MONTH Value_SP500_INFLADJ_MONTH Value_SP500_PSR_QUARTER Value_SP500_DIV_MONTH Value_SP500_DIV_YEAR Value_SP500_DIV_GROWTH_YEAR Value_SP500_DIV_GROWTH_QUARTER Value_SP500_PBV_RATIO_QUARTER Value_SHILLER_PE_RATIO_YEAR Value_SP500_PE_RATIO_YEAR Value_SP500_DIV_YIELD_YEAR Value_SP500_PSR_YEAR Value_SP500_EARNINGS_YIELD_YEAR Value_SP500_PBV_RATIO_YEAR Value_SP500_INFLADJ_YEAR Value_SP500_SALES_YEAR Value_SP500_SALES_GROWTH_YEAR Value_SP500_SALES_QUARTER Value_SP500_REAL_SALES_GROWTH_QUARTER Value_SP500_SALES_GROWTH_QUARTER Value_SP500_REAL_SALES_GROWTH_YEAR Value_SP500_REAL_EARNINGS_GROWTH_YEAR Value_SP500_REAL_SALES_YEAR Value_SP500_REAL_EARNINGS_GROWTH_QUARTER Value_SP500_EARNINGS_GROWTH_QUARTER Value_SP500_REAL_SALES_QUARTER Value_SP500_EARNINGS_MONTH Value_SP500_BVPS_YEAR Value_SP500_EARNINGS_YEAR Value_SP500_EARNINGS_GROWTH_YEAR Value_SP500_BVPS_QUARTER
Date
1871-01-01 4.440 5.86 11.100 10.920 9.010 89.81 1.77 5.260 5.150000 13.38 13.38 5.05 11.9 11.100000 5.860000 1.77 9.010000 5.05 89.8100 745.7 -1.18 745.7 -3.66 -1.18 -3.66 -7.94 1087.39 -7.94 -3.71 1087.39 8.090 290.68 7.920000 -3.71 290.68
1871-01-31 4.470 5.86 11.175 10.920 8.950 89.07 1.77 5.260 5.150000 13.38 13.38 5.05 11.9 11.140417 5.860000 1.77 8.979583 5.05 90.1025 745.7 -1.18 745.7 -3.66 -1.18 -3.66 -7.94 1087.39 -7.94 -3.71 1087.39 8.090 290.68 7.920000 -3.71 290.68
1871-02-01 4.500 5.82 11.250 10.920 8.890 88.33 1.77 5.180 5.151364 13.38 13.38 5.05 11.9 11.180833 5.843182 1.77 8.949167 5.05 90.3950 745.7 -1.18 745.7 -3.66 -1.18 -3.66 -7.94 1087.39 -7.94 -3.71 1087.39 7.970 290.68 7.922273 -3.71 290.68
1871-02-28 4.555 5.78 11.385 11.055 8.785 88.75 1.77 5.100 5.152727 13.38 13.38 5.05 11.9 11.221250 5.826364 1.77 8.918750 5.05 90.6875 745.7 -1.18 745.7 -3.66 -1.18 -3.66 -7.94 1087.39 -7.94 -3.71 1087.39 7.850 290.68 7.924545 -3.71 290.68
1871-03-01 4.610 5.71 11.520 11.190 8.680 89.17 1.77 5.065 5.154091 13.38 13.38 5.05 11.9 11.261667 5.809545 1.77 8.888333 5.05 90.9800 745.7 -1.18 745.7 -3.66 -1.18 -3.66 -7.94 1087.39 -7.94 -3.71 1087.39 7.795 290.68 7.926818 -3.71 290.68
In [32]:
autocorrelation_plot(df_imputed)
plt.show()
In [33]:
autocorrelation_plot(df_interpolate)
plt.show()
In [34]:
#Corelation plot after imputation
plt.figure(figsize=(20,15))
sns.heatmap(df_imputed.corr(),annot=True,fmt='.2f',square=False)
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e5333594e0>
In [35]:
#Correlation heatmap plot after interpolation
plt.figure(figsize=(20,15))
sns.heatmap(df_interpolate.corr(),annot=True,fmt='.2f',square=False)
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e538499eb8>
In [36]:
#Check for missing values per column and create a graph
Column_missing_values(df_imputed)
Column_missing_values(df_interpolate)
No missing values in provided dataframe
No missing values in provided dataframe
In [37]:
df_imputed.plot(figsize=(50,30),fontsize=30)
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e535eb1e10>
In [38]:
df_interpolate.plot(figsize=(30,20),fontsize=20)
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e537aaf518>
In [39]:
df.hist(figsize=(15,10))
Out[39]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001E537DDC898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E537E114E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E537E34E80>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E537E63908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E537E942E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E537E94320>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E537EE6828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E537F172E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E537F3BD68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E537F69828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E537F992E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E537FBED68>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E537FED828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53801E2E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538043D68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538621828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5386502E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538675D68>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E5386A3828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5386D62E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5386F9D68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5387254E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53874BF60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53877AA20>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E538C294E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538C51F60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538C7FA20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538CAF4E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538CD7F60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538D04A20>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E538D334E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538D58F60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538D88A20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538DB84E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538DDDF60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E538E0CA20>]],
      dtype=object)
In [40]:
df_imputed.hist(figsize=(30,20))
Out[40]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001E538E4FEB8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E539078240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53AF996A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5391A5048>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5391C2AC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5391C2B00>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E53920D048>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E539234A90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E539261550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53928AFD0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5392B6A90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5392E8550>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E53930DFD0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53933AA90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53936C550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E539392FD0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5393BEA90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5393EF550>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E539416FD0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E539443A90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E539472550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53949BFD0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5394C8A90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5394F7550>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E53951DFD0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53954BA90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53957C550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5395A2FD0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5395CE748>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5395FF208>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E539624C88>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E539651748>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E539686208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5396AAC88>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E5396D7748>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E539709208>]],
      dtype=object)
In [41]:
#Histogram plot of all variables in interpolated dataset
df_interpolate.hist(figsize=(30,20))
Out[41]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BB8C390>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53B7148D0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BAD1748>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BC79C50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BC99710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BC99748>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BCE0C18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BD0F6D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BD3C198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BD63C18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BD926D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BDC1198>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BDE7C18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BE156D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BE47198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BE6AC18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BE9A6D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BECA198>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BEEEC18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BF1F6D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BF4D198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BF75C18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BFA26D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BFD2198>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E53BFF5C18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53C0256D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53C056198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53C07AC18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53C0AB6D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53C0DB198>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001E53C0FFC18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53C12D6D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53C15E198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53C182C18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53C1B16D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001E53C1E5198>]],
      dtype=object)

Final analysis on interpolation and imputation datasets.

linear Interpolation on missing values looks promising and there is better correlation between variables of the dataset. Imputation has not shown better correlation between variables. I would use interpolation dataframe for further analysis.

Write dfs to csv files for further reference.

save df, df_imputed and df_iterpolated to csv file.

In [42]:
#Write dfs to csv file.
df.to_csv('SandP500_Index_Master.csv',index=False)
df_imputed.to_csv('SandP500_Index_df_imputed.csv',index=False)
df_interpolate.to_csv('SandP500_Index_df_Interpolated.csv',index=False)

Feature Scaling

Apply StandardScaler() to the imputed df and interpolated df to normalize feature values.
In [43]:
# Apply feature scaling on all values to the entire numerical dataframe.define function
def Apply_Standard_Scaler(df):
    '''
    This funtion applies StandardScaler() to columns/features of a given dataframe.
    IN- a pandas dataframe
    OUT - df_scaled_features dataframe of scaled features.
    scaler - StandardScaler() object.    
    '''
    scaler = StandardScaler()
    scaled_features = StandardScaler().fit_transform(df.values)
    df_scaled_features = pd.DataFrame(scaled_features,index=df.index,columns = df.columns)
    #df_scaled_features.describe()
    
    return df_scaled_features, scaler
In [44]:
print ('Number of columns present in imputed and interpolated datasets are : {} & {}'.format (
    len(df_imputed.columns),len(df_interpolate.columns)))
Number of columns present in imputed and interpolated datasets are : 35 & 35
In [45]:
# Apply standard scaling function to both dfs and return scaled df.
df_scaled_features_imputed, scaler_df_imputed = Apply_Standard_Scaler(df_imputed)
df_scaled_features_interpolation, scaler_df_interpolate = Apply_Standard_Scaler(df_interpolate)

Apply Principal Component Analysis (PCA) for feature selection

Apply PCA feature extraction analysis to find groups of features with highest and lowest variance.
In [46]:
#function to apply PCA feature scaling
def scree_plot(pca):
    '''
    Creates a scree plot associated with the principal components 
    
    INPUT: pca - the result of instance of PCA in scikit learn
            
    OUTPUT:
            None
    '''
    num_components = len(pca.explained_variance_ratio_)
    ind = np.arange(num_components)
    vals = pca.explained_variance_ratio_
 
    plt.figure(figsize=(25, 10))
    ax = plt.subplot(111)
    cumvals = np.cumsum(vals)
    ax.bar(ind, vals)
    ax.plot(ind, cumvals)
    #print (ind, cumvals)
    for i in range(num_components):
        ax.annotate(r"%s%%" % ((str(vals[i]*100)[:4])), (ind[i]+0.2, vals[i]), va="bottom", ha="center", fontsize=12)
     
 
    ax.xaxis.set_tick_params(width=0)
    ax.yaxis.set_tick_params(width=2, length=12)
 
    ax.set_xlabel("Principal Component")
    ax.set_ylabel("Variance Explained (%)")
    plt.title('Explained Variance Per Principal Component')
    

# Apply PCA to the data for all features
def Apply_PCA(df_scaled_features,n_components):
    '''
    This function would create a PCA object in scikit-learn with n_components and apply fit_transform function of PCA on scaled features df.
    This function internally calls another 'scree_plot' function to create a chart of feature variance.
    IN - 
    df_scaled_features - numerically scaled dataframe.
    n_components - n number of components for PCA feature analysis
    
    Output- Returns PCA object with provided n_components
    '''
    pca = PCA(n_components)
    pca_scaled_features = pca.fit_transform(df_scaled_features)
    scree_plot(pca)
    
    return pca


# Map weights for the first principal component to corresponding feature names
# and then print the linked values, sorted by weight.
def sorted_weights(pca, ix, dataset):
    """
    Docstring- map the weights and components from PCA analysis.
    Input parameters-
    Input - pca initialized model
    ix = index number of first set of components.
    dataset = a dataframe of scaled features
    
    Output- A Tuple of features with variance.
    Prints a bar chart with feature names and related variance.
    """
    a1 = pca.components_[ix]
    a2 = dataset.keys().values
    a = list(zip(a1, a2))
    a.sort(key=lambda tup: tup[0])
    x_labels = [val[0] for val in a]
    y_labels = [val[1] for val in a]
    plt.Figure(figsize=(20, 15))
    ax = pd.Series(x_labels).plot(kind='bar')
    ax.set_xticklabels(y_labels)
    rects = ax.patches
    # for rect, label in zip(rects, x_labels):
    #     height = rect.get_height()
    #     ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
    return a


#Cluster of of features with highest variance
def Print_PCAfeatures_graph(df,a,n):
    '''
    Function to print PCA features in line chart with top n lowest variance or top n highest variance for a dataframe.
    IN- 
    df- dataframe on which PCA analysis was done.
    a - result from 'sorted_weights' function. A sequence of 2-d array with sorted weights of features.
    n- number of features required to be printed in the chart. Negative (-n) shall show features with positive variance while 
    positive (n) would show features with negative variance.
    
    OUT- name of features from cluster a.
    '''    
    groups = []
    for i in range(len(a)):
        groups.append(a[i][1])
    
    #Slice 1-D array appropriately
    if n < 0:
        groups = groups[n:]
    else:
        groups = groups[:n]
    
    #print(groups)
    
    i = 1
    for group in groups:
        plt.subplot(len(groups), 1, i)
        plt.plot(df[group].values)
        plt.title(group, y=0.5, loc='center')
        i += 1
    plt.figure(figsize=(30,20))
    plt.show()
    
    return groups
Visualize PCA feature extraction and variance for imputed and interpolated dfs
In [47]:
#Apply PCA for all features except target output.
pca_imputed = Apply_PCA(df_scaled_features_imputed.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1),n_components=10)
In [48]:
#Apply PCA for all features except target output.
pca_interpolated = Apply_PCA(df_scaled_features_interpolation.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1),n_components=10)
Visualize weights of features for first group of extracted features with highest variance
In [49]:
# #Cluster of of features with highest variance
# def Print_PCAfeatures_graph(df,a,n):
#     '''
#     Function to print PCA features in line chart with top n lowest variance or top n highest variance for a dataframe.
#     IN- 
#     df- dataframe on which PCA analysis was done.
#     a - result from 'sorted_weights' function. A sequence of 2-d array with sorted weights of features.
#     n- number of features required to be printed in the chart. Negative (-n) shall show features with maximum variance while 
#     positive n would show features with lowest variance.
    
#     OUT- name of features from cluster a.
#     '''    
#     groups = []
#     for i in range(len(a)):
#         groups.append(a[i][1])
    
#     #Slice 1-D array appropriately
#     if n < 0:
#         groups = groups[n:]
#     else:
#         groups = groups[:n]
    
#     #print(groups)
    
#     i = 1
#     for group in groups:
#         plt.subplot(len(groups), 1, i)
#         plt.plot(df[group].values)
#         plt.title(group, y=0.5, loc='center')
#         i += 1
#     plt.figure(figsize=(30,20))
#     plt.show()
    
#     return groups
In [50]:
#List cluster of features with highest variance from PCA analysis
a = sorted_weights(pca_imputed,1,df_scaled_features_imputed)
In [51]:
#print chart with top 5 features with maximum variance
Print_PCAfeatures_graph(df_imputed,a,-5)
<Figure size 3000x2000 with 0 Axes>
Out[51]:
['Value_SHILLER_PE_RATIO_YEAR',
 'Value_SP500_EARNINGS_YIELD_MONTH',
 'Value_SP500_PE_RATIO_MONTH',
 'Value_SP500_PBV_RATIO_YEAR',
 'Value_SP500_PBV_RATIO_QUARTER']
In [52]:
#print chart with top 5 features with lowest variance
Print_PCAfeatures_graph(df_imputed,a,5)
<Figure size 3000x2000 with 0 Axes>
Out[52]:
['Value_SHILLER_PE_RATIO_MONTH',
 'Value_SP500_PSR_QUARTER',
 'Value_SP500_REAL_SALES_QUARTER',
 'Value_SP500_PSR_YEAR',
 'Value_SP500_REAL_SALES_GROWTH_QUARTER']
In [53]:
b = sorted_weights(pca_interpolated,1,df_scaled_features_interpolation)
b
Out[53]:
[(-0.3099270964617896, 'Value_SP500_DIV_GROWTH_YEAR'),
 (-0.304721029314339, 'Value_SP500_DIV_YEAR'),
 (-0.2470383278900263, 'Value_SP500_REAL_SALES_GROWTH_QUARTER'),
 (-0.2387458673708968, 'Value_SP500_SALES_YEAR'),
 (-0.23832544559286753, 'Value_SP500_SALES_QUARTER'),
 (-0.22862267710171633, 'Value_SP500_SALES_GROWTH_QUARTER'),
 (-0.2276975766969925, 'Value_SP500_EARNINGS_GROWTH_QUARTER'),
 (-0.22053633106204035, 'Value_SP500_REAL_EARNINGS_GROWTH_YEAR'),
 (-0.18570532863264608, 'Value_SHILLER_PE_RATIO_MONTH'),
 (-0.1803014408947123, 'Value_SP500_PSR_YEAR'),
 (-0.14675080518992648, 'Value_SP500_SALES_GROWTH_YEAR'),
 (-0.14261472452590387, 'Value_SP500_INFLADJ_YEAR'),
 (-0.13310173004335485, 'Value_SP500_PE_RATIO_YEAR'),
 (-0.13165915021671848, 'Value_SP500_REAL_PRICE_MONTH'),
 (-0.1275332154522863, 'Value_SP500_INFLADJ_MONTH'),
 (-0.11806250612104158, 'Value_SP500_EARNINGS_GROWTH_YEAR'),
 (-0.11776297826352891, 'Value_SP500_EARNINGS_MONTH'),
 (-0.1168244365989096, 'Value_SP500_DIV_YIELD_YEAR'),
 (-0.06950409932183269, 'Value_SP500_REAL_SALES_QUARTER'),
 (-0.05601635917739762, 'Value_SP500_BVPS_YEAR'),
 (-0.02714587682828222, 'Value_SP500_PSR_QUARTER'),
 (-0.02606010953354218, 'Value_SP500_DIV_MONTH'),
 (-0.007376927107058928, 'Value_SP500_DIV_GROWTH_QUARTER'),
 (-0.002314480823035312, 'Value_SP500_EARNINGS_YIELD_YEAR'),
 (0.02256896923088267, 'Value_SP500_EARNINGS_YIELD_MONTH'),
 (0.025265445913568784, 'Value_SP500_PBV_RATIO_YEAR'),
 (0.1255333992926099, 'Value_SP500_PE_RATIO_MONTH'),
 (0.1287789446050626, 'Value_SP500_PBV_RATIO_QUARTER'),
 (0.16520849173637447, 'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER'),
 (0.16883681596348415, 'Value_SP500_REAL_SALES_YEAR'),
 (0.20153174127326595, 'Value_SHILLER_PE_RATIO_YEAR'),
 (0.2157723138756866, 'Value_SP500_EARNINGS_YEAR'),
 (0.21697622894808202, 'Value_SP500_DIV_YIELD_MONTH'),
 (0.2211050245648737, 'Value_SP500_REAL_SALES_GROWTH_YEAR')]
In [54]:
#print chart with top 5 features with maximum variance
Print_PCAfeatures_graph(df_interpolate,b,-6)
<Figure size 3000x2000 with 0 Axes>
Out[54]:
['Value_SP500_REAL_EARNINGS_GROWTH_QUARTER',
 'Value_SP500_REAL_SALES_YEAR',
 'Value_SHILLER_PE_RATIO_YEAR',
 'Value_SP500_EARNINGS_YEAR',
 'Value_SP500_DIV_YIELD_MONTH',
 'Value_SP500_REAL_SALES_GROWTH_YEAR']
In [55]:
Print_PCAfeatures_graph(df_interpolate,b,8)
<Figure size 3000x2000 with 0 Axes>
Out[55]:
['Value_SP500_DIV_GROWTH_YEAR',
 'Value_SP500_DIV_YEAR',
 'Value_SP500_REAL_SALES_GROWTH_QUARTER',
 'Value_SP500_SALES_YEAR',
 'Value_SP500_SALES_QUARTER',
 'Value_SP500_SALES_GROWTH_QUARTER',
 'Value_SP500_EARNINGS_GROWTH_QUARTER',
 'Value_SP500_REAL_EARNINGS_GROWTH_YEAR']

Split dataset into Train and Test set.

Split 80% of dataset into train and 20% into test based on chronological order because its a timeseries. Avoid look-ahead bias by doing this.
In [56]:
def Create_Training_Test_Dataset(df,split_percent,Linear_regr):
    '''
    This function would split, slice and create training and test datasets. Provide 'Value_SP500_REAL_PRICE_MONTH' in the input 
    df. It would input to X
    
    IN- df- a dataframe from which training and test dataset needs to slice.
    split_percent - split percent for training and test dataset.
    Linear_regr - A flag to split between dataset for linear regression object or LSTM network. Works for this project only.
    
    OUT- X_train, Y_train, X_test, Y_test
    
    '''
       
    # Split the size into 80% and 20% based on rows.
    train_size = int(len(df) * split_percent)
    test_size = len(df) - train_size
    #print(train_size,test_size)
    print('Training and Test dataset is of size {} & {}'.format(train_size,test_size))
    
    #Slice the df into train and test df.
    train = df.iloc[0:train_size,:]
    test = df.iloc[train_size:len(df),:]
    #print(train.shape, test.shape)
    
    #Check for Linear Regression flag from user input. If false it would return dataset for LSTM neural network.
    if Linear_regr == 'False':
        #Create Training dataset
        temp_train = train.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
        X_train = temp_train.iloc[0:train_size,:]
        #X_train.head()
        Y_train = train.iloc[0:train_size,:1]
        #print(Y_train.head())
        #print(X_train.shape,Y_train.shape)
        #print(X_train.columns,Y_train.columns)
        print('Features size of X_train and training target Y_train shape is {} & {}'.format(X_train.shape,Y_train.shape))

        #Define Test dataset
        temp_test = test.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
        X_test = temp_test.iloc[0:test_size,:]
        #X_train.head()
        Y_test = test.iloc[0:test_size,:1]
        #print(Y_train.head())
        #print(X_test.shape,Y_test.shape)
        #print(X_test.columns,Y_test.columns)
        print('Features size of X_test and Test target Y_test shape is {} & {}'.format(X_test.shape,Y_test.shape))
        
    else:
        #Train dataset
        X_train = train['Value_SP500_REAL_PRICE_MONTH'][0:train_size]
        Y_train = train['Value_SP500_REAL_PRICE_MONTH'][0:test_size]
        print('Features size of X_train and training target Y_train shape is {} & {}'.format(X_train.shape,Y_train.shape))
            
        #Test dataset
        X_test = test['Value_SP500_REAL_PRICE_MONTH'][0:train_size]
        Y_test = test['Value_SP500_REAL_PRICE_MONTH'][0:test_size]
        print('Features size of X_test and Test target Y_test shape is {} & {}'.format(X_test.shape,Y_test.shape))
        

    return X_train, Y_train, X_test, Y_test
    
def Convert_dataset_nparray(X_train, Y_train, X_test, Y_test):
    '''
    This function would convert the training and test dataset to np.array.
    In- X_train, Y_train, X_test, Y_test
    
    OUT- np.array of X_train, Y_train, X_test, Y_test
    
    '''
    #Convert to np array as required for LSTM model.
    X_train = np.array(X_train)
    Y_train = np.array(Y_train)
    #print(X_train.shape,Y_train.shape)
    print('Training dataset is converted to np.array with size {} & {}'.format(X_train.shape,Y_train.shape))

    #Convert test dataset to np.array.
    X_test = np.array(X_test)
    Y_test = np.array(Y_test)
    #print(X_test.shape,Y_test.shape)
    print('Test dataset is converted to np.array with size {} & {}'.format(X_test.shape,Y_test.shape))
    
    return X_train, Y_train, X_test, Y_test

Define and Fit LSTM model in keras

Multivariate time-series prediction
In [57]:
# from subprocess import check_output
# from keras.layers.core import Dense, Activation, Dropout
# from keras.layers.recurrent import LSTM
# from keras.layers.embeddings import Embedding
# from keras.models import Sequential
# from keras.layers import LSTM, CuDNNLSTM , BatchNormalization
# import tensorflow as tf
# from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint


# import time
# from numpy import newaxis
In [58]:
# #Build Model
# model = Sequential()

# model.add(LSTM(input_dim=1,output_dim=50,return_sequences=True))
# model.add(Dropout(0.2))

# model.add(LSTM(100,return_sequences=False))
# model.add(Dropout(0.2))

# model.add(Dense(output_dim=1))
# model.add(Activation('linear'))

# start = time.time()
# model.compile(loss='mse', optimizer='rmsprop')
# print ('compilation time : ', time.time() - start)
In [59]:
# #Build the model
# model = Sequential()
# model.add(LSTM(256,input_shape=(2837,34)))
# model.add(Dense(1))
# model.compile(optimizer='adam',loss='mse')
# #Reshape data for (Sample,Timestep,Features) 
# #X_train = X_train.reshape((X_train.shape[0],X_train.shape[1],1))
# #X_test = X_test.reshape((X_test.shape[0],X_test.shape[1],1))
# #Fit model with history to check for overfitting
# #history = model.fit(X_train,y_train,epochs=300,validation_data=(X_test,y_test),shuffle=False)
In [60]:
#model.fit(X_train,Y_train,batch_size=128,epochs=10,validation_split=0.05)
In [61]:
# model = Sequential([
#     Dense(32, input_shape=(2837,34)),
#     Activation('relu'),
#     Dense(10),
#     Activation('softmax'),
# ])

# #Compile
# # For a mean squared error regression problem
# model.compile(optimizer='rmsprop',
#               loss='mse')
In [62]:
df_interpolate.columns
Out[62]:
Index(['Value_SP500_REAL_PRICE_MONTH', 'Value_SP500_DIV_YIELD_MONTH',
       'Value_SP500_PE_RATIO_MONTH', 'Value_SHILLER_PE_RATIO_MONTH',
       'Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_INFLADJ_MONTH',
       'Value_SP500_PSR_QUARTER', 'Value_SP500_DIV_MONTH',
       'Value_SP500_DIV_YEAR', 'Value_SP500_DIV_GROWTH_YEAR',
       'Value_SP500_DIV_GROWTH_QUARTER', 'Value_SP500_PBV_RATIO_QUARTER',
       'Value_SHILLER_PE_RATIO_YEAR', 'Value_SP500_PE_RATIO_YEAR',
       'Value_SP500_DIV_YIELD_YEAR', 'Value_SP500_PSR_YEAR',
       'Value_SP500_EARNINGS_YIELD_YEAR', 'Value_SP500_PBV_RATIO_YEAR',
       'Value_SP500_INFLADJ_YEAR', 'Value_SP500_SALES_YEAR',
       'Value_SP500_SALES_GROWTH_YEAR', 'Value_SP500_SALES_QUARTER',
       'Value_SP500_REAL_SALES_GROWTH_QUARTER',
       'Value_SP500_SALES_GROWTH_QUARTER',
       'Value_SP500_REAL_SALES_GROWTH_YEAR',
       'Value_SP500_REAL_EARNINGS_GROWTH_YEAR', 'Value_SP500_REAL_SALES_YEAR',
       'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER',
       'Value_SP500_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_SALES_QUARTER',
       'Value_SP500_EARNINGS_MONTH', 'Value_SP500_BVPS_YEAR',
       'Value_SP500_EARNINGS_YEAR', 'Value_SP500_EARNINGS_GROWTH_YEAR',
       'Value_SP500_BVPS_QUARTER'],
      dtype='object')
In [159]:
#Not using PCA features here because the model resulted in huge mse errors. I think monthly features should be used which could 
#predict the real price correctly.
df_PCA_features = df_interpolate.loc[:,['Value_SP500_REAL_PRICE_MONTH',
                                        'Value_SP500_DIV_YIELD_MONTH',
                                        'Value_SP500_PE_RATIO_MONTH',
                                        'Value_SHILLER_PE_RATIO_MONTH',
                                        'Value_SP500_EARNINGS_YIELD_MONTH',
                                        'Value_SP500_INFLADJ_MONTH',
                                        'Value_SP500_EARNINGS_MONTH','Value_SP500_PSR_QUARTER','Value_SP500_SALES_QUARTER',
                                       'Value_SP500_REAL_SALES_GROWTH_QUARTER','Value_SP500_REAL_EARNINGS_GROWTH_QUARTER']]

df_PCA_features.shape
Out[159]:
(3548, 11)
In [160]:
#apply scaling
scaler_df_imputed = StandardScaler()
scaled_features = scaler_df_imputed.fit_transform(df_PCA_features.values)
df_scaled_features_interpol = pd.DataFrame(scaled_features,index=df_PCA_features.index,columns = df_PCA_features.columns)
#df_scaled_features.describe()
In [161]:
#Use real data
#X_train, Y_train, X_test, Y_test = Create_Training_Test_Dataset(df=df_PCA_features, split_percent=0.95, Linear_regr='False')
#X_train, Y_train, X_test, Y_test = Convert_dataset_nparray(X_train, Y_train, X_test, Y_test)
In [162]:
#Split master dataframe into training and test datasets
X_train, Y_train, X_test, Y_test = Create_Training_Test_Dataset(df=df_PCA_features, split_percent=0.98, Linear_regr='False')

X_train, Y_train, X_test, Y_test = Convert_dataset_nparray(X_train, Y_train, X_test, Y_test)
Training and Test dataset is of size 3477 & 71
Features size of X_train and training target Y_train shape is (3477, 10) & (3477, 1)
Features size of X_test and Test target Y_test shape is (71, 10) & (71, 1)
Training dataset is converted to np.array with size (3477, 10) & (3477, 1)
Test dataset is converted to np.array with size (71, 10) & (71, 1)
In [163]:
#Shape training data.
#the inputs (X) are reshaped into the 3D format expected by LSTM, namely [samples, timesteps, features]
X_train = X_train.reshape((X_train.shape[0],1,10))
#Y_train = Y_train.reshape((Y_train.shape[0],1,1)) #Do not reshape it. 
print(X_train.shape,Y_train.shape)
print(X_train.shape[1],X_train.shape[2])

#Shape test dataset correctly for LSTM predict.
X_test = X_test.reshape((X_test.shape[0],1,10))
#Y_test = Y_test.reshape((Y_test.shape[0],Y_test.shape[1],1))
print(X_test.shape,Y_test.shape)
print('X_test input shape : ', X_test.shape[0],X_test.shape[1],X_test.shape[2])
(3477, 1, 10) (3477, 1)
1 10
(71, 1, 10) (71, 1)
X_test input shape :  71 1 10
In [164]:
# #Design network for mean squared error regression problem
# model = Sequential()
# model.add(LSTM(512, input_shape=(X_train.shape[1], X_train.shape[2])))
# model.add(Dropout(0.2))
# Activation('relu')
# model.add(Dense(1))
# Activation('softmax')

# # Compiling the model using mean square error loss, and Adam optimizer.
# model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
# #model.compile(loss='mse',optimizer='rmsprop',metrics=['accuracy'])
In [165]:
# max_features = 1024

# model = Sequential()
# model.add(Embedding(max_features, output_dim=256))
# model.add(LSTM(128))
# model.add(Dropout(0.5))
# model.add(Dense(1, activation='sigmoid'))

# model.compile(loss='binary_crossentropy',
#               optimizer='rmsprop',
#               metrics=['accuracy'])
In [166]:
#Design network for mean squared error regression problem
model = Sequential()
#The first dimension is supposed to be each sample.input should be (n_samples, timesteps, n_features)
model.add(LSTM(512, input_shape=(X_train.shape[1],X_train.shape[2])))
model.add(Dropout(0.2))
#model.add(BatchNormalization())

# model.add(LSTM(256, input_shape=(X_train.shape[0],1,X_train.shape[2]),return_sequences=True))
# model.add(Dropout(0.1))
# #model.add(BatchNormalization())

Activation('relu')
model.add(Dense(1))

#Activation('softmax')

# Compiling the model using mean square error loss, and Adam optimizer.
model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
#model.compile(loss='mse',optimizer='rmsprop',metrics=['accuracy'])
filepath = "RNN_final-{epoch:02d}-{val_acc:.3f}"#unique file name which will include epochs and validation accuracy score.
checkpoint = ModelCheckpoint("models/{}.model".format(filepath,monitor = 'val_acc',verbose = 1))

print(model.summary)
<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x000001E54D6F5F98>>
In [167]:
# fit network with epochs
history = model.fit(X_train, Y_train, epochs=500, batch_size=100, validation_data=(X_train, Y_train), verbose=2,
                    callbacks = [checkpoint],
                    shuffle=False)

model.save('LSTM_model',overwrite=True,include_optimizer=True)
Train on 3477 samples, validate on 3477 samples
Epoch 1/500
 - 3s - loss: 243339.4370 - acc: 0.0000e+00 - val_loss: 242714.4353 - val_acc: 0.0000e+00
Epoch 2/500
 - 1s - loss: 241162.7605 - acc: 0.0000e+00 - val_loss: 240804.1092 - val_acc: 2.8760e-04
Epoch 3/500
 - 1s - loss: 239919.4807 - acc: 0.0000e+00 - val_loss: 239705.8956 - val_acc: 2.8760e-04
Epoch 4/500
 - 1s - loss: 238994.8505 - acc: 0.0000e+00 - val_loss: 238805.9905 - val_acc: 0.0000e+00
Epoch 5/500
 - 1s - loss: 238148.2302 - acc: 0.0000e+00 - val_loss: 237966.7018 - val_acc: 0.0000e+00
Epoch 6/500
 - 1s - loss: 237272.5937 - acc: 0.0000e+00 - val_loss: 237097.1663 - val_acc: 0.0000e+00
Epoch 7/500
 - 1s - loss: 236451.4622 - acc: 0.0000e+00 - val_loss: 236272.0884 - val_acc: 0.0000e+00
Epoch 8/500
 - 1s - loss: 235675.5565 - acc: 2.8760e-04 - val_loss: 235472.4969 - val_acc: 2.8760e-04
Epoch 9/500
 - 1s - loss: 234850.1869 - acc: 0.0000e+00 - val_loss: 234682.3712 - val_acc: 0.0000e+00
Epoch 10/500
 - 1s - loss: 234135.3833 - acc: 2.8760e-04 - val_loss: 233926.5933 - val_acc: 0.0000e+00
Epoch 11/500
 - 1s - loss: 233322.8948 - acc: 0.0000e+00 - val_loss: 233148.9700 - val_acc: 0.0000e+00
Epoch 12/500
 - 1s - loss: 232576.8213 - acc: 0.0000e+00 - val_loss: 232353.4952 - val_acc: 0.0000e+00
Epoch 13/500
 - 1s - loss: 231753.3019 - acc: 0.0000e+00 - val_loss: 231597.8444 - val_acc: 2.8760e-04
Epoch 14/500
 - 1s - loss: 231032.9564 - acc: 2.8760e-04 - val_loss: 230852.7865 - val_acc: 0.0000e+00
Epoch 15/500
 - 1s - loss: 230292.6349 - acc: 2.8760e-04 - val_loss: 230112.4808 - val_acc: 0.0000e+00
Epoch 16/500
 - 1s - loss: 229526.3014 - acc: 0.0000e+00 - val_loss: 229379.9607 - val_acc: 0.0000e+00
Epoch 17/500
 - 1s - loss: 228825.9301 - acc: 2.8760e-04 - val_loss: 228661.3356 - val_acc: 0.0000e+00
Epoch 18/500
 - 2s - loss: 228118.5561 - acc: 0.0000e+00 - val_loss: 227955.8876 - val_acc: 0.0000e+00
Epoch 19/500
 - 2s - loss: 227397.1509 - acc: 0.0000e+00 - val_loss: 227261.4097 - val_acc: 0.0000e+00
Epoch 20/500
 - 2s - loss: 226751.8984 - acc: 0.0000e+00 - val_loss: 226573.6249 - val_acc: 0.0000e+00
Epoch 21/500
 - 2s - loss: 226037.4590 - acc: 2.8760e-04 - val_loss: 225894.3047 - val_acc: 2.8760e-04
Epoch 22/500
 - 2s - loss: 225569.3964 - acc: 0.0000e+00 - val_loss: 225315.5182 - val_acc: 0.0000e+00
Epoch 23/500
 - 2s - loss: 224719.4619 - acc: 0.0000e+00 - val_loss: 224562.6777 - val_acc: 0.0000e+00
Epoch 24/500
 - 2s - loss: 224061.0709 - acc: 0.0000e+00 - val_loss: 223905.0793 - val_acc: 0.0000e+00
Epoch 25/500
 - 2s - loss: 223420.1020 - acc: 0.0000e+00 - val_loss: 223248.1206 - val_acc: 0.0000e+00
Epoch 26/500
 - 1s - loss: 222702.2231 - acc: 0.0000e+00 - val_loss: 222581.2486 - val_acc: 0.0000e+00
Epoch 27/500
 - 1s - loss: 222100.0577 - acc: 2.8760e-04 - val_loss: 221942.7014 - val_acc: 0.0000e+00
Epoch 28/500
 - 2s - loss: 221483.1033 - acc: 2.8760e-04 - val_loss: 221305.8124 - val_acc: 2.8760e-04
Epoch 29/500
 - 1s - loss: 221164.4045 - acc: 0.0000e+00 - val_loss: 220921.8952 - val_acc: 0.0000e+00
Epoch 30/500
 - 1s - loss: 220465.0957 - acc: 0.0000e+00 - val_loss: 220299.0549 - val_acc: 0.0000e+00
Epoch 31/500
 - 1s - loss: 219756.9181 - acc: 0.0000e+00 - val_loss: 219681.9905 - val_acc: 0.0000e+00
Epoch 32/500
 - 2s - loss: 219298.7041 - acc: 2.8760e-04 - val_loss: 219062.2763 - val_acc: 0.0000e+00
Epoch 33/500
 - 2s - loss: 218690.2736 - acc: 0.0000e+00 - val_loss: 218477.0263 - val_acc: 0.0000e+00
Epoch 34/500
 - 2s - loss: 218257.6683 - acc: 0.0000e+00 - val_loss: 218160.7385 - val_acc: 0.0000e+00
Epoch 35/500
 - 2s - loss: 217705.7209 - acc: 0.0000e+00 - val_loss: 217522.5719 - val_acc: 2.8760e-04
Epoch 36/500
 - 2s - loss: 216778.8285 - acc: 0.0000e+00 - val_loss: 216696.9238 - val_acc: 0.0000e+00
Epoch 37/500
 - 1s - loss: 216299.8198 - acc: 0.0000e+00 - val_loss: 216088.7995 - val_acc: 0.0000e+00
Epoch 38/500
 - 2s - loss: 215750.9451 - acc: 2.8760e-04 - val_loss: 215506.2003 - val_acc: 0.0000e+00
Epoch 39/500
 - 2s - loss: 215045.0029 - acc: 0.0000e+00 - val_loss: 214934.4999 - val_acc: 0.0000e+00
Epoch 40/500
 - 2s - loss: 214524.8656 - acc: 0.0000e+00 - val_loss: 214361.9306 - val_acc: 0.0000e+00
Epoch 41/500
 - 2s - loss: 213993.4389 - acc: 0.0000e+00 - val_loss: 213801.9505 - val_acc: 0.0000e+00
Epoch 42/500
 - 2s - loss: 213371.5354 - acc: 0.0000e+00 - val_loss: 213238.8752 - val_acc: 0.0000e+00
Epoch 43/500
 - 1s - loss: 212892.6655 - acc: 0.0000e+00 - val_loss: 212686.4150 - val_acc: 2.8760e-04
Epoch 44/500
 - 1s - loss: 212655.3949 - acc: 0.0000e+00 - val_loss: 212486.1022 - val_acc: 2.8760e-04
Epoch 45/500
 - 1s - loss: 211995.9674 - acc: 0.0000e+00 - val_loss: 211933.9086 - val_acc: 0.0000e+00
Epoch 46/500
 - 1s - loss: 211515.5962 - acc: 0.0000e+00 - val_loss: 211393.3153 - val_acc: 0.0000e+00
Epoch 47/500
 - 1s - loss: 210926.3843 - acc: 0.0000e+00 - val_loss: 210848.7520 - val_acc: 0.0000e+00
Epoch 48/500
 - 1s - loss: 211180.3884 - acc: 0.0000e+00 - val_loss: 210646.3960 - val_acc: 0.0000e+00
Epoch 49/500
 - 2s - loss: 209984.0191 - acc: 0.0000e+00 - val_loss: 209739.4114 - val_acc: 0.0000e+00
Epoch 50/500
 - 2s - loss: 209357.7884 - acc: 0.0000e+00 - val_loss: 209214.0551 - val_acc: 0.0000e+00
Epoch 51/500
 - 2s - loss: 208866.4859 - acc: 2.8760e-04 - val_loss: 208688.0837 - val_acc: 0.0000e+00
Epoch 52/500
 - 2s - loss: 208274.7003 - acc: 2.8760e-04 - val_loss: 208169.3333 - val_acc: 0.0000e+00
Epoch 53/500
 - 1s - loss: 207703.4854 - acc: 0.0000e+00 - val_loss: 207652.7864 - val_acc: 0.0000e+00
Epoch 54/500
 - 2s - loss: 207361.9242 - acc: 0.0000e+00 - val_loss: 207144.4490 - val_acc: 0.0000e+00
Epoch 55/500
 - 2s - loss: 206746.4984 - acc: 2.8760e-04 - val_loss: 206637.0927 - val_acc: 2.8760e-04
Epoch 56/500
 - 2s - loss: 206777.9321 - acc: 2.8760e-04 - val_loss: 206598.0003 - val_acc: 2.8760e-04
Epoch 57/500
 - 2s - loss: 206293.4617 - acc: 0.0000e+00 - val_loss: 206095.5555 - val_acc: 2.8760e-04
Epoch 58/500
 - 2s - loss: 205751.3194 - acc: 0.0000e+00 - val_loss: 205602.5550 - val_acc: 5.7521e-04
Epoch 59/500
 - 2s - loss: 205390.5106 - acc: 0.0000e+00 - val_loss: 205097.7249 - val_acc: 2.8760e-04
Epoch 60/500
 - 2s - loss: 204818.5186 - acc: 2.8760e-04 - val_loss: 204596.0372 - val_acc: 0.0000e+00
Epoch 61/500
 - 2s - loss: 204105.1385 - acc: 0.0000e+00 - val_loss: 204100.3282 - val_acc: 2.8760e-04
Epoch 62/500
 - 2s - loss: 203824.3437 - acc: 2.8760e-04 - val_loss: 203621.5036 - val_acc: 2.8760e-04
Epoch 63/500
 - 2s - loss: 203475.2630 - acc: 0.0000e+00 - val_loss: 203151.2190 - val_acc: 0.0000e+00
Epoch 64/500
 - 2s - loss: 202766.9528 - acc: 0.0000e+00 - val_loss: 202667.7471 - val_acc: 5.7521e-04
Epoch 65/500
 - 2s - loss: 202312.7902 - acc: 2.8760e-04 - val_loss: 202193.4177 - val_acc: 0.0000e+00
Epoch 66/500
 - 2s - loss: 202356.4267 - acc: 0.0000e+00 - val_loss: 202120.2318 - val_acc: 2.8760e-04
Epoch 67/500
 - 2s - loss: 201901.7359 - acc: 8.6281e-04 - val_loss: 201647.6772 - val_acc: 0.0000e+00
Epoch 68/500
 - 2s - loss: 201351.1909 - acc: 2.8760e-04 - val_loss: 201172.3936 - val_acc: 0.0000e+00
Epoch 69/500
 - 2s - loss: 200831.8690 - acc: 0.0000e+00 - val_loss: 200713.1985 - val_acc: 2.8760e-04
Epoch 70/500
 - 2s - loss: 200514.9758 - acc: 0.0000e+00 - val_loss: 200233.3952 - val_acc: 0.0000e+00
Epoch 71/500
 - 2s - loss: 199717.9855 - acc: 0.0000e+00 - val_loss: 199455.3688 - val_acc: 0.0000e+00
Epoch 72/500
 - 2s - loss: 199304.5319 - acc: 0.0000e+00 - val_loss: 199000.3805 - val_acc: 0.0000e+00
Epoch 73/500
 - 2s - loss: 198717.1111 - acc: 0.0000e+00 - val_loss: 198545.5219 - val_acc: 0.0000e+00
Epoch 74/500
 - 2s - loss: 198383.9670 - acc: 0.0000e+00 - val_loss: 198091.4415 - val_acc: 0.0000e+00
Epoch 75/500
 - 2s - loss: 197817.1290 - acc: 0.0000e+00 - val_loss: 197646.5827 - val_acc: 0.0000e+00
Epoch 76/500
 - 2s - loss: 197381.3781 - acc: 0.0000e+00 - val_loss: 197202.6017 - val_acc: 0.0000e+00
Epoch 77/500
 - 2s - loss: 196877.1992 - acc: 0.0000e+00 - val_loss: 196767.8716 - val_acc: 2.8760e-04
Epoch 78/500
 - 2s - loss: 196467.1117 - acc: 0.0000e+00 - val_loss: 196334.3113 - val_acc: 0.0000e+00
Epoch 79/500
 - 2s - loss: 196177.3487 - acc: 0.0000e+00 - val_loss: 195903.0680 - val_acc: 0.0000e+00
Epoch 80/500
 - 2s - loss: 195683.7580 - acc: 0.0000e+00 - val_loss: 195470.2095 - val_acc: 0.0000e+00
Epoch 81/500
 - 2s - loss: 195422.5725 - acc: 0.0000e+00 - val_loss: 195038.8352 - val_acc: 0.0000e+00
Epoch 82/500
 - 2s - loss: 194854.1778 - acc: 0.0000e+00 - val_loss: 194615.1241 - val_acc: 0.0000e+00
Epoch 83/500
 - 2s - loss: 194343.1340 - acc: 0.0000e+00 - val_loss: 194201.8916 - val_acc: 0.0000e+00
Epoch 84/500
 - 2s - loss: 193875.1634 - acc: 0.0000e+00 - val_loss: 193774.1580 - val_acc: 0.0000e+00
Epoch 85/500
 - 2s - loss: 193553.8551 - acc: 0.0000e+00 - val_loss: 193356.4870 - val_acc: 0.0000e+00
Epoch 86/500
 - 2s - loss: 193196.1364 - acc: 0.0000e+00 - val_loss: 192931.4692 - val_acc: 2.8760e-04
Epoch 87/500
 - 2s - loss: 192823.6895 - acc: 0.0000e+00 - val_loss: 192527.4390 - val_acc: 2.8760e-04
Epoch 88/500
 - 2s - loss: 192282.9051 - acc: 0.0000e+00 - val_loss: 192114.5593 - val_acc: 0.0000e+00
Epoch 89/500
 - 2s - loss: 191857.8865 - acc: 0.0000e+00 - val_loss: 191707.0105 - val_acc: 0.0000e+00
Epoch 90/500
 - 2s - loss: 191505.9370 - acc: 0.0000e+00 - val_loss: 191308.5476 - val_acc: 0.0000e+00
Epoch 91/500
 - 2s - loss: 191114.9983 - acc: 0.0000e+00 - val_loss: 190898.5700 - val_acc: 0.0000e+00
Epoch 92/500
 - 2s - loss: 190835.3250 - acc: 0.0000e+00 - val_loss: 190495.9148 - val_acc: 2.8760e-04
Epoch 93/500
 - 2s - loss: 190249.8917 - acc: 0.0000e+00 - val_loss: 190095.8689 - val_acc: 2.8760e-04
Epoch 94/500
 - 2s - loss: 190008.7924 - acc: 0.0000e+00 - val_loss: 189699.4791 - val_acc: 0.0000e+00
Epoch 95/500
 - 2s - loss: 189464.0374 - acc: 0.0000e+00 - val_loss: 189301.9428 - val_acc: 2.8760e-04
Epoch 96/500
 - 2s - loss: 189000.1108 - acc: 0.0000e+00 - val_loss: 188902.8807 - val_acc: 0.0000e+00
Epoch 97/500
 - 2s - loss: 188524.3779 - acc: 0.0000e+00 - val_loss: 188509.1563 - val_acc: 0.0000e+00
Epoch 98/500
 - 2s - loss: 188582.9736 - acc: 2.8760e-04 - val_loss: 187986.5844 - val_acc: 0.0000e+00
Epoch 99/500
 - 2s - loss: 187823.3821 - acc: 0.0000e+00 - val_loss: 187656.2729 - val_acc: 0.0000e+00
Epoch 100/500
 - 2s - loss: 187548.0945 - acc: 0.0000e+00 - val_loss: 187190.4434 - val_acc: 0.0000e+00
Epoch 101/500
 - 2s - loss: 187084.9576 - acc: 0.0000e+00 - val_loss: 186809.6147 - val_acc: 0.0000e+00
Epoch 102/500
 - 2s - loss: 187113.1767 - acc: 0.0000e+00 - val_loss: 186896.8456 - val_acc: 0.0000e+00
Epoch 103/500
 - 2s - loss: 186742.1331 - acc: 0.0000e+00 - val_loss: 186507.7935 - val_acc: 0.0000e+00
Epoch 104/500
 - 2s - loss: 186273.4697 - acc: 0.0000e+00 - val_loss: 186121.3310 - val_acc: 0.0000e+00
Epoch 105/500
 - 2s - loss: 185940.0060 - acc: 0.0000e+00 - val_loss: 185735.9088 - val_acc: 0.0000e+00
Epoch 106/500
 - 2s - loss: 185428.6046 - acc: 0.0000e+00 - val_loss: 185354.3255 - val_acc: 0.0000e+00
Epoch 107/500
 - 2s - loss: 185234.9341 - acc: 0.0000e+00 - val_loss: 184976.3639 - val_acc: 0.0000e+00
Epoch 108/500
 - 2s - loss: 184637.5837 - acc: 0.0000e+00 - val_loss: 184594.0943 - val_acc: 0.0000e+00
Epoch 109/500
 - 2s - loss: 184418.0348 - acc: 0.0000e+00 - val_loss: 184215.0162 - val_acc: 0.0000e+00
Epoch 110/500
 - 2s - loss: 184115.8884 - acc: 0.0000e+00 - val_loss: 183843.3211 - val_acc: 0.0000e+00
Epoch 111/500
 - 2s - loss: 183652.7114 - acc: 2.8760e-04 - val_loss: 183502.9403 - val_acc: 0.0000e+00
Epoch 112/500
 - 2s - loss: 183361.3243 - acc: 0.0000e+00 - val_loss: 183110.0005 - val_acc: 0.0000e+00
Epoch 113/500
 - 2s - loss: 183245.8812 - acc: 0.0000e+00 - val_loss: 182742.8113 - val_acc: 0.0000e+00
Epoch 114/500
 - 2s - loss: 182698.3783 - acc: 0.0000e+00 - val_loss: 182381.9065 - val_acc: 0.0000e+00
Epoch 115/500
 - 2s - loss: 182178.5259 - acc: 0.0000e+00 - val_loss: 182016.0790 - val_acc: 0.0000e+00
Epoch 116/500
 - 2s - loss: 182033.1843 - acc: 0.0000e+00 - val_loss: 181652.4819 - val_acc: 0.0000e+00
Epoch 117/500
 - 2s - loss: 181861.9280 - acc: 0.0000e+00 - val_loss: 181299.4412 - val_acc: 0.0000e+00
Epoch 118/500
 - 2s - loss: 181190.8525 - acc: 0.0000e+00 - val_loss: 180932.9580 - val_acc: 0.0000e+00
Epoch 119/500
 - 2s - loss: 180615.9637 - acc: 0.0000e+00 - val_loss: 180585.3063 - val_acc: 0.0000e+00
Epoch 120/500
 - 2s - loss: 180458.7817 - acc: 2.8760e-04 - val_loss: 180233.3298 - val_acc: 0.0000e+00
Epoch 121/500
 - 2s - loss: 180227.0807 - acc: 0.0000e+00 - val_loss: 179875.1620 - val_acc: 0.0000e+00
Epoch 122/500
 - 2s - loss: 179992.1728 - acc: 0.0000e+00 - val_loss: 179522.5561 - val_acc: 0.0000e+00
Epoch 123/500
 - 2s - loss: 179340.9631 - acc: 0.0000e+00 - val_loss: 179175.9631 - val_acc: 0.0000e+00
Epoch 124/500
 - 2s - loss: 179107.5935 - acc: 0.0000e+00 - val_loss: 178836.4184 - val_acc: 0.0000e+00
Epoch 125/500
 - 2s - loss: 178790.9279 - acc: 5.7521e-04 - val_loss: 178483.6317 - val_acc: 0.0000e+00
Epoch 126/500
 - 2s - loss: 178481.6867 - acc: 0.0000e+00 - val_loss: 178148.8725 - val_acc: 0.0000e+00
Epoch 127/500
 - 2s - loss: 178137.9250 - acc: 0.0000e+00 - val_loss: 177808.8997 - val_acc: 2.8760e-04
Epoch 128/500
 - 2s - loss: 177878.5549 - acc: 0.0000e+00 - val_loss: 177465.5303 - val_acc: 0.0000e+00
Epoch 129/500
 - 2s - loss: 177610.9472 - acc: 0.0000e+00 - val_loss: 177126.9104 - val_acc: 0.0000e+00
Epoch 130/500
 - 2s - loss: 177079.4112 - acc: 0.0000e+00 - val_loss: 176788.9154 - val_acc: 0.0000e+00
Epoch 131/500
 - 2s - loss: 176756.6301 - acc: 0.0000e+00 - val_loss: 176460.1091 - val_acc: 0.0000e+00
Epoch 132/500
 - 2s - loss: 176592.4682 - acc: 0.0000e+00 - val_loss: 176115.6201 - val_acc: 0.0000e+00
Epoch 133/500
 - 2s - loss: 176046.2411 - acc: 2.8760e-04 - val_loss: 175796.2173 - val_acc: 0.0000e+00
Epoch 134/500
 - 2s - loss: 175742.8151 - acc: 0.0000e+00 - val_loss: 175466.4175 - val_acc: 2.8760e-04
Epoch 135/500
 - 2s - loss: 175503.3632 - acc: 0.0000e+00 - val_loss: 175127.3169 - val_acc: 0.0000e+00
Epoch 136/500
 - 2s - loss: 174907.9096 - acc: 0.0000e+00 - val_loss: 174805.7643 - val_acc: 0.0000e+00
Epoch 137/500
 - 2s - loss: 174611.3577 - acc: 2.8760e-04 - val_loss: 174476.3127 - val_acc: 0.0000e+00
Epoch 138/500
 - 2s - loss: 174609.3507 - acc: 2.8760e-04 - val_loss: 174153.6887 - val_acc: 0.0000e+00
Epoch 139/500
 - 2s - loss: 174289.8259 - acc: 0.0000e+00 - val_loss: 173835.7960 - val_acc: 0.0000e+00
Epoch 140/500
 - 2s - loss: 173853.6221 - acc: 0.0000e+00 - val_loss: 173517.1795 - val_acc: 0.0000e+00
Epoch 141/500
 - 2s - loss: 173428.0288 - acc: 0.0000e+00 - val_loss: 173187.3412 - val_acc: 0.0000e+00
Epoch 142/500
 - 2s - loss: 173124.6239 - acc: 0.0000e+00 - val_loss: 172866.8039 - val_acc: 0.0000e+00
Epoch 143/500
 - 2s - loss: 172793.6353 - acc: 0.0000e+00 - val_loss: 172529.6396 - val_acc: 0.0000e+00
Epoch 144/500
 - 2s - loss: 172523.0024 - acc: 0.0000e+00 - val_loss: 172232.5899 - val_acc: 0.0000e+00
Epoch 145/500
 - 2s - loss: 172297.6709 - acc: 0.0000e+00 - val_loss: 171899.2591 - val_acc: 0.0000e+00
Epoch 146/500
 - 2s - loss: 171976.2375 - acc: 0.0000e+00 - val_loss: 171572.7133 - val_acc: 0.0000e+00
Epoch 147/500
 - 2s - loss: 171752.3841 - acc: 0.0000e+00 - val_loss: 171291.9471 - val_acc: 0.0000e+00
Epoch 148/500
 - 2s - loss: 171162.3094 - acc: 0.0000e+00 - val_loss: 170975.3312 - val_acc: 0.0000e+00
Epoch 149/500
 - 2s - loss: 170903.4530 - acc: 0.0000e+00 - val_loss: 170647.9497 - val_acc: 0.0000e+00
Epoch 150/500
 - 2s - loss: 170708.6626 - acc: 0.0000e+00 - val_loss: 170361.4426 - val_acc: 0.0000e+00
Epoch 151/500
 - 2s - loss: 170524.1417 - acc: 0.0000e+00 - val_loss: 170060.2050 - val_acc: 2.8760e-04
Epoch 152/500
 - 2s - loss: 170242.3502 - acc: 2.8760e-04 - val_loss: 169728.2799 - val_acc: 0.0000e+00
Epoch 153/500
 - 2s - loss: 169962.3401 - acc: 0.0000e+00 - val_loss: 169416.0171 - val_acc: 0.0000e+00
Epoch 154/500
 - 2s - loss: 169702.7952 - acc: 0.0000e+00 - val_loss: 169155.4697 - val_acc: 2.8760e-04
Epoch 155/500
 - 2s - loss: 169182.8902 - acc: 0.0000e+00 - val_loss: 168819.1732 - val_acc: 2.8760e-04
Epoch 156/500
 - 2s - loss: 168995.0300 - acc: 2.8760e-04 - val_loss: 168523.8319 - val_acc: 2.8760e-04
Epoch 157/500
 - 2s - loss: 168804.0758 - acc: 2.8760e-04 - val_loss: 168185.9636 - val_acc: 0.0000e+00
Epoch 158/500
 - 2s - loss: 168191.5185 - acc: 0.0000e+00 - val_loss: 167904.0125 - val_acc: 0.0000e+00
Epoch 159/500
 - 2s - loss: 168160.3843 - acc: 0.0000e+00 - val_loss: 167647.9119 - val_acc: 0.0000e+00
Epoch 160/500
 - 2s - loss: 168010.0011 - acc: 0.0000e+00 - val_loss: 167345.5596 - val_acc: 0.0000e+00
Epoch 161/500
 - 2s - loss: 167200.0176 - acc: 2.8760e-04 - val_loss: 167019.4958 - val_acc: 0.0000e+00
Epoch 162/500
 - 2s - loss: 167286.1218 - acc: 0.0000e+00 - val_loss: 166635.0658 - val_acc: 0.0000e+00
Epoch 163/500
 - 2s - loss: 167080.4452 - acc: 0.0000e+00 - val_loss: 166473.3401 - val_acc: 0.0000e+00
Epoch 164/500
 - 2s - loss: 166466.0709 - acc: 0.0000e+00 - val_loss: 166136.9338 - val_acc: 2.8760e-04
Epoch 165/500
 - 2s - loss: 166228.6813 - acc: 0.0000e+00 - val_loss: 165869.2517 - val_acc: 5.7521e-04
Epoch 166/500
 - 2s - loss: 166097.4674 - acc: 0.0000e+00 - val_loss: 165572.6432 - val_acc: 2.8760e-04
Epoch 167/500
 - 2s - loss: 165569.6036 - acc: 0.0000e+00 - val_loss: 165287.6017 - val_acc: 0.0000e+00
Epoch 168/500
 - 2s - loss: 165623.5437 - acc: 0.0000e+00 - val_loss: 164953.8091 - val_acc: 2.8760e-04
Epoch 169/500
 - 2s - loss: 165307.4956 - acc: 2.8760e-04 - val_loss: 164622.1161 - val_acc: 2.8760e-04
Epoch 170/500
 - 2s - loss: 164783.2370 - acc: 0.0000e+00 - val_loss: 164341.8462 - val_acc: 0.0000e+00
Epoch 171/500
 - 2s - loss: 164384.8154 - acc: 0.0000e+00 - val_loss: 164031.0053 - val_acc: 0.0000e+00
Epoch 172/500
 - 2s - loss: 164162.9165 - acc: 0.0000e+00 - val_loss: 163689.1634 - val_acc: 0.0000e+00
Epoch 173/500
 - 2s - loss: 164125.1459 - acc: 0.0000e+00 - val_loss: 163421.2106 - val_acc: 0.0000e+00
Epoch 174/500
 - 2s - loss: 163615.3991 - acc: 0.0000e+00 - val_loss: 163097.5967 - val_acc: 0.0000e+00
Epoch 175/500
 - 2s - loss: 163489.0387 - acc: 0.0000e+00 - val_loss: 162873.1600 - val_acc: 0.0000e+00
Epoch 176/500
 - 2s - loss: 163155.9230 - acc: 0.0000e+00 - val_loss: 162581.9887 - val_acc: 0.0000e+00
Epoch 177/500
 - 2s - loss: 162859.4050 - acc: 0.0000e+00 - val_loss: 162198.9720 - val_acc: 0.0000e+00
Epoch 178/500
 - 2s - loss: 162879.7991 - acc: 0.0000e+00 - val_loss: 162051.9035 - val_acc: 0.0000e+00
Epoch 179/500
 - 2s - loss: 162299.2808 - acc: 0.0000e+00 - val_loss: 161689.6037 - val_acc: 2.8760e-04
Epoch 180/500
 - 2s - loss: 162125.8359 - acc: 0.0000e+00 - val_loss: 161388.0723 - val_acc: 2.8760e-04
Epoch 181/500
 - 2s - loss: 161834.3630 - acc: 0.0000e+00 - val_loss: 161094.3696 - val_acc: 2.8760e-04
Epoch 182/500
 - 2s - loss: 161552.4347 - acc: 0.0000e+00 - val_loss: 160935.1006 - val_acc: 0.0000e+00
Epoch 183/500
 - 2s - loss: 161391.3173 - acc: 0.0000e+00 - val_loss: 160625.5333 - val_acc: 0.0000e+00
Epoch 184/500
 - 2s - loss: 160741.8211 - acc: 0.0000e+00 - val_loss: 160230.8716 - val_acc: 0.0000e+00
Epoch 185/500
 - 2s - loss: 160698.9086 - acc: 0.0000e+00 - val_loss: 159992.5158 - val_acc: 2.8760e-04
Epoch 186/500
 - 2s - loss: 160452.7113 - acc: 0.0000e+00 - val_loss: 159711.3022 - val_acc: 2.8760e-04
Epoch 187/500
 - 2s - loss: 160235.0938 - acc: 0.0000e+00 - val_loss: 159384.2852 - val_acc: 2.8760e-04
Epoch 188/500
 - 2s - loss: 159797.2247 - acc: 2.8760e-04 - val_loss: 159102.7106 - val_acc: 0.0000e+00
Epoch 189/500
 - 2s - loss: 159415.5979 - acc: 2.8760e-04 - val_loss: 158851.5803 - val_acc: 0.0000e+00
Epoch 190/500
 - 2s - loss: 159382.6426 - acc: 0.0000e+00 - val_loss: 158572.6205 - val_acc: 0.0000e+00
Epoch 191/500
 - 2s - loss: 158959.7986 - acc: 0.0000e+00 - val_loss: 158245.0662 - val_acc: 0.0000e+00
Epoch 192/500
 - 2s - loss: 158587.9760 - acc: 2.8760e-04 - val_loss: 157980.4719 - val_acc: 0.0000e+00
Epoch 193/500
 - 2s - loss: 158600.9266 - acc: 0.0000e+00 - val_loss: 157756.4615 - val_acc: 0.0000e+00
Epoch 194/500
 - 2s - loss: 158078.9056 - acc: 0.0000e+00 - val_loss: 157390.7395 - val_acc: 0.0000e+00
Epoch 195/500
 - 2s - loss: 157891.4568 - acc: 0.0000e+00 - val_loss: 157187.0537 - val_acc: 0.0000e+00
Epoch 196/500
 - 2s - loss: 157615.6274 - acc: 0.0000e+00 - val_loss: 156915.5112 - val_acc: 0.0000e+00
Epoch 197/500
 - 2s - loss: 157160.0082 - acc: 0.0000e+00 - val_loss: 156648.8380 - val_acc: 0.0000e+00
Epoch 198/500
 - 3s - loss: 157200.2425 - acc: 0.0000e+00 - val_loss: 156421.0472 - val_acc: 0.0000e+00
Epoch 199/500
 - 2s - loss: 156869.6605 - acc: 2.8760e-04 - val_loss: 156175.4612 - val_acc: 0.0000e+00
Epoch 200/500
 - 2s - loss: 156672.3571 - acc: 0.0000e+00 - val_loss: 155890.2040 - val_acc: 0.0000e+00
Epoch 201/500
 - 2s - loss: 156051.5479 - acc: 2.8760e-04 - val_loss: 155504.1347 - val_acc: 0.0000e+00
Epoch 202/500
 - 2s - loss: 155901.3640 - acc: 0.0000e+00 - val_loss: 155323.2393 - val_acc: 0.0000e+00
Epoch 203/500
 - 2s - loss: 155789.8756 - acc: 0.0000e+00 - val_loss: 155009.7290 - val_acc: 0.0000e+00
Epoch 204/500
 - 2s - loss: 155466.7994 - acc: 0.0000e+00 - val_loss: 154763.4590 - val_acc: 0.0000e+00
Epoch 205/500
 - 2s - loss: 155367.1725 - acc: 0.0000e+00 - val_loss: 154476.1140 - val_acc: 0.0000e+00
Epoch 206/500
 - 2s - loss: 154999.4332 - acc: 2.8760e-04 - val_loss: 154220.2388 - val_acc: 0.0000e+00
Epoch 207/500
 - 2s - loss: 154750.5173 - acc: 0.0000e+00 - val_loss: 153974.6749 - val_acc: 0.0000e+00
Epoch 208/500
 - 2s - loss: 154418.7816 - acc: 0.0000e+00 - val_loss: 153694.9579 - val_acc: 0.0000e+00
Epoch 209/500
 - 2s - loss: 153963.3343 - acc: 2.8760e-04 - val_loss: 153424.6389 - val_acc: 0.0000e+00
Epoch 210/500
 - 2s - loss: 154237.0511 - acc: 2.8760e-04 - val_loss: 153219.9716 - val_acc: 0.0000e+00
Epoch 211/500
 - 2s - loss: 153530.8117 - acc: 5.7521e-04 - val_loss: 152898.6119 - val_acc: 0.0000e+00
Epoch 212/500
 - 2s - loss: 153115.9693 - acc: 2.8760e-04 - val_loss: 152613.0596 - val_acc: 0.0000e+00
Epoch 213/500
 - 2s - loss: 153052.8169 - acc: 0.0000e+00 - val_loss: 152363.2343 - val_acc: 0.0000e+00
Epoch 214/500
 - 2s - loss: 153112.2147 - acc: 0.0000e+00 - val_loss: 152118.4079 - val_acc: 0.0000e+00
Epoch 215/500
 - 2s - loss: 152293.8418 - acc: 0.0000e+00 - val_loss: 151860.1592 - val_acc: 0.0000e+00
Epoch 216/500
 - 2s - loss: 152309.8142 - acc: 2.8760e-04 - val_loss: 151585.1005 - val_acc: 0.0000e+00
Epoch 217/500
 - 2s - loss: 152003.4079 - acc: 0.0000e+00 - val_loss: 151318.5452 - val_acc: 0.0000e+00
Epoch 218/500
 - 2s - loss: 151801.8089 - acc: 0.0000e+00 - val_loss: 151011.6174 - val_acc: 0.0000e+00
Epoch 219/500
 - 2s - loss: 151493.6602 - acc: 0.0000e+00 - val_loss: 150879.5737 - val_acc: 0.0000e+00
Epoch 220/500
 - 2s - loss: 151245.8471 - acc: 0.0000e+00 - val_loss: 150603.1636 - val_acc: 0.0000e+00
Epoch 221/500
 - 2s - loss: 151178.9075 - acc: 0.0000e+00 - val_loss: 150353.0207 - val_acc: 0.0000e+00
Epoch 222/500
 - 2s - loss: 150897.8741 - acc: 0.0000e+00 - val_loss: 150050.8866 - val_acc: 0.0000e+00
Epoch 223/500
 - 2s - loss: 150456.2675 - acc: 0.0000e+00 - val_loss: 149789.9268 - val_acc: 0.0000e+00
Epoch 224/500
 - 2s - loss: 149981.1968 - acc: 0.0000e+00 - val_loss: 149544.8592 - val_acc: 0.0000e+00
Epoch 225/500
 - 2s - loss: 150066.3879 - acc: 0.0000e+00 - val_loss: 149230.6581 - val_acc: 0.0000e+00
Epoch 226/500
 - 2s - loss: 149841.1425 - acc: 0.0000e+00 - val_loss: 149056.8353 - val_acc: 0.0000e+00
Epoch 227/500
 - 2s - loss: 149628.7708 - acc: 0.0000e+00 - val_loss: 148798.5240 - val_acc: 0.0000e+00
Epoch 228/500
 - 3s - loss: 149403.6729 - acc: 2.8760e-04 - val_loss: 148508.6356 - val_acc: 0.0000e+00
Epoch 229/500
 - 2s - loss: 148956.9944 - acc: 0.0000e+00 - val_loss: 148288.1646 - val_acc: 0.0000e+00
Epoch 230/500
 - 2s - loss: 148783.9153 - acc: 2.8760e-04 - val_loss: 147999.7455 - val_acc: 0.0000e+00
Epoch 231/500
 - 2s - loss: 148449.5945 - acc: 0.0000e+00 - val_loss: 147765.9182 - val_acc: 0.0000e+00
Epoch 232/500
 - 2s - loss: 148363.2919 - acc: 0.0000e+00 - val_loss: 147546.8759 - val_acc: 0.0000e+00
Epoch 233/500
 - 2s - loss: 147776.0751 - acc: 0.0000e+00 - val_loss: 147245.9764 - val_acc: 0.0000e+00
Epoch 234/500
 - 2s - loss: 147665.7166 - acc: 0.0000e+00 - val_loss: 146982.8459 - val_acc: 0.0000e+00
Epoch 235/500
 - 2s - loss: 147344.9490 - acc: 0.0000e+00 - val_loss: 146738.0363 - val_acc: 0.0000e+00
Epoch 236/500
 - 2s - loss: 147255.7611 - acc: 0.0000e+00 - val_loss: 146477.6938 - val_acc: 0.0000e+00
Epoch 237/500
 - 2s - loss: 146784.6851 - acc: 0.0000e+00 - val_loss: 146258.7967 - val_acc: 0.0000e+00
Epoch 238/500
 - 2s - loss: 146617.4036 - acc: 2.8760e-04 - val_loss: 145986.0381 - val_acc: 0.0000e+00
Epoch 239/500
 - 2s - loss: 146464.8262 - acc: 2.8760e-04 - val_loss: 145794.0066 - val_acc: 0.0000e+00
Epoch 240/500
 - 2s - loss: 146127.2701 - acc: 0.0000e+00 - val_loss: 145522.6462 - val_acc: 0.0000e+00
Epoch 241/500
 - 2s - loss: 145850.6584 - acc: 0.0000e+00 - val_loss: 145217.4114 - val_acc: 0.0000e+00
Epoch 242/500
 - 2s - loss: 145499.3587 - acc: 0.0000e+00 - val_loss: 144984.3418 - val_acc: 0.0000e+00
Epoch 243/500
 - 2s - loss: 145512.0066 - acc: 2.8760e-04 - val_loss: 144774.6197 - val_acc: 0.0000e+00
Epoch 244/500
 - 2s - loss: 145126.7895 - acc: 0.0000e+00 - val_loss: 144492.1389 - val_acc: 0.0000e+00
Epoch 245/500
 - 2s - loss: 145074.4676 - acc: 0.0000e+00 - val_loss: 144267.9684 - val_acc: 0.0000e+00
Epoch 246/500
 - 2s - loss: 144660.4512 - acc: 0.0000e+00 - val_loss: 143979.6775 - val_acc: 2.8760e-04
Epoch 247/500
 - 2s - loss: 144638.7024 - acc: 0.0000e+00 - val_loss: 143802.4433 - val_acc: 2.8760e-04
Epoch 248/500
 - 2s - loss: 144454.3364 - acc: 0.0000e+00 - val_loss: 143505.5637 - val_acc: 2.8760e-04
Epoch 249/500
 - 2s - loss: 144139.4769 - acc: 2.8760e-04 - val_loss: 143312.1368 - val_acc: 2.8760e-04
Epoch 250/500
 - 2s - loss: 143674.0164 - acc: 0.0000e+00 - val_loss: 143054.1678 - val_acc: 2.8760e-04
Epoch 251/500
 - 2s - loss: 143524.1866 - acc: 0.0000e+00 - val_loss: 142870.9143 - val_acc: 0.0000e+00
Epoch 252/500
 - 2s - loss: 143225.3097 - acc: 0.0000e+00 - val_loss: 142587.4751 - val_acc: 0.0000e+00
Epoch 253/500
 - 2s - loss: 142930.0695 - acc: 0.0000e+00 - val_loss: 142342.4352 - val_acc: 0.0000e+00
Epoch 254/500
 - 2s - loss: 142998.2570 - acc: 0.0000e+00 - val_loss: 142147.3379 - val_acc: 0.0000e+00
Epoch 255/500
 - 2s - loss: 142605.9415 - acc: 0.0000e+00 - val_loss: 141887.8703 - val_acc: 0.0000e+00
Epoch 256/500
 - 2s - loss: 142385.0274 - acc: 0.0000e+00 - val_loss: 141638.3684 - val_acc: 0.0000e+00
Epoch 257/500
 - 2s - loss: 142104.4011 - acc: 2.8760e-04 - val_loss: 141385.3904 - val_acc: 0.0000e+00
Epoch 258/500
 - 2s - loss: 142162.0529 - acc: 0.0000e+00 - val_loss: 141174.9069 - val_acc: 0.0000e+00
Epoch 259/500
 - 2s - loss: 141958.3432 - acc: 2.8760e-04 - val_loss: 140969.4075 - val_acc: 0.0000e+00
Epoch 260/500
 - 2s - loss: 141576.0149 - acc: 0.0000e+00 - val_loss: 140745.6025 - val_acc: 0.0000e+00
Epoch 261/500
 - 2s - loss: 141045.0811 - acc: 0.0000e+00 - val_loss: 140424.7214 - val_acc: 0.0000e+00
Epoch 262/500
 - 2s - loss: 141099.4340 - acc: 0.0000e+00 - val_loss: 140245.1252 - val_acc: 0.0000e+00
Epoch 263/500
 - 2s - loss: 140779.1084 - acc: 0.0000e+00 - val_loss: 140008.7503 - val_acc: 0.0000e+00
Epoch 264/500
 - 2s - loss: 140512.2265 - acc: 0.0000e+00 - val_loss: 139788.5445 - val_acc: 0.0000e+00
Epoch 265/500
 - 2s - loss: 140775.8771 - acc: 0.0000e+00 - val_loss: 139569.0363 - val_acc: 0.0000e+00
Epoch 266/500
 - 2s - loss: 140206.0051 - acc: 0.0000e+00 - val_loss: 139323.5103 - val_acc: 0.0000e+00
Epoch 267/500
 - 2s - loss: 139714.1097 - acc: 0.0000e+00 - val_loss: 139044.7612 - val_acc: 0.0000e+00
Epoch 268/500
 - 2s - loss: 139433.3812 - acc: 0.0000e+00 - val_loss: 138833.4457 - val_acc: 0.0000e+00
Epoch 269/500
 - 2s - loss: 139084.2115 - acc: 0.0000e+00 - val_loss: 138590.7889 - val_acc: 0.0000e+00
Epoch 270/500
 - 2s - loss: 138948.2853 - acc: 0.0000e+00 - val_loss: 138394.5507 - val_acc: 0.0000e+00
Epoch 271/500
 - 2s - loss: 138977.0617 - acc: 0.0000e+00 - val_loss: 138223.1965 - val_acc: 0.0000e+00
Epoch 272/500
 - 2s - loss: 138273.3202 - acc: 0.0000e+00 - val_loss: 137901.3763 - val_acc: 0.0000e+00
Epoch 273/500
 - 2s - loss: 138723.5378 - acc: 0.0000e+00 - val_loss: 137739.5425 - val_acc: 0.0000e+00
Epoch 274/500
 - 2s - loss: 138305.8302 - acc: 0.0000e+00 - val_loss: 137421.3930 - val_acc: 0.0000e+00
Epoch 275/500
 - 2s - loss: 137892.5265 - acc: 2.8760e-04 - val_loss: 137280.3918 - val_acc: 0.0000e+00
Epoch 276/500
 - 2s - loss: 137678.6718 - acc: 0.0000e+00 - val_loss: 136917.0710 - val_acc: 0.0000e+00
Epoch 277/500
 - 2s - loss: 138072.5832 - acc: 2.8760e-04 - val_loss: 136844.8789 - val_acc: 0.0000e+00
Epoch 278/500
 - 2s - loss: 137260.6344 - acc: 0.0000e+00 - val_loss: 136619.5742 - val_acc: 0.0000e+00
Epoch 279/500
 - 2s - loss: 137465.0830 - acc: 0.0000e+00 - val_loss: 136337.8390 - val_acc: 0.0000e+00
Epoch 280/500
 - 2s - loss: 136639.5057 - acc: 0.0000e+00 - val_loss: 136066.2178 - val_acc: 0.0000e+00
Epoch 281/500
 - 2s - loss: 136817.2178 - acc: 0.0000e+00 - val_loss: 135889.8101 - val_acc: 0.0000e+00
Epoch 282/500
 - 2s - loss: 136469.6263 - acc: 0.0000e+00 - val_loss: 135631.8158 - val_acc: 0.0000e+00
Epoch 283/500
 - 2s - loss: 135843.9206 - acc: 0.0000e+00 - val_loss: 135425.1382 - val_acc: 0.0000e+00
Epoch 284/500
 - 2s - loss: 135961.8998 - acc: 2.8760e-04 - val_loss: 135245.8427 - val_acc: 0.0000e+00
Epoch 285/500
 - 2s - loss: 135270.7461 - acc: 0.0000e+00 - val_loss: 134910.0718 - val_acc: 0.0000e+00
Epoch 286/500
 - 2s - loss: 135498.9362 - acc: 0.0000e+00 - val_loss: 134770.0466 - val_acc: 0.0000e+00
Epoch 287/500
 - 3s - loss: 134886.6966 - acc: 0.0000e+00 - val_loss: 134501.4029 - val_acc: 0.0000e+00
Epoch 288/500
 - 2s - loss: 135099.5701 - acc: 0.0000e+00 - val_loss: 134277.0943 - val_acc: 0.0000e+00
Epoch 289/500
 - 2s - loss: 135162.2676 - acc: 0.0000e+00 - val_loss: 134140.9472 - val_acc: 0.0000e+00
Epoch 290/500
 - 2s - loss: 134555.0574 - acc: 0.0000e+00 - val_loss: 133855.0343 - val_acc: 0.0000e+00
Epoch 291/500
 - 2s - loss: 134353.6519 - acc: 2.8760e-04 - val_loss: 133622.1077 - val_acc: 2.8760e-04
Epoch 292/500
 - 3s - loss: 133992.2645 - acc: 0.0000e+00 - val_loss: 133410.5673 - val_acc: 2.8760e-04
Epoch 293/500
 - 2s - loss: 133847.2031 - acc: 0.0000e+00 - val_loss: 133155.7503 - val_acc: 2.8760e-04
Epoch 294/500
 - 2s - loss: 133778.7678 - acc: 0.0000e+00 - val_loss: 132969.6993 - val_acc: 2.8760e-04
Epoch 295/500
 - 2s - loss: 133383.6468 - acc: 0.0000e+00 - val_loss: 132734.5014 - val_acc: 2.8760e-04
Epoch 296/500
 - 2s - loss: 133531.9903 - acc: 0.0000e+00 - val_loss: 132514.2400 - val_acc: 2.8760e-04
Epoch 297/500
 - 2s - loss: 133053.1592 - acc: 0.0000e+00 - val_loss: 132297.4555 - val_acc: 2.8760e-04
Epoch 298/500
 - 2s - loss: 133302.1895 - acc: 0.0000e+00 - val_loss: 132077.3451 - val_acc: 2.8760e-04
Epoch 299/500
 - 2s - loss: 132689.0420 - acc: 0.0000e+00 - val_loss: 131857.6665 - val_acc: 2.8760e-04
Epoch 300/500
 - 2s - loss: 132473.5216 - acc: 0.0000e+00 - val_loss: 131657.2425 - val_acc: 2.8760e-04
Epoch 301/500
 - 2s - loss: 132070.9815 - acc: 0.0000e+00 - val_loss: 131403.7001 - val_acc: 2.8760e-04
Epoch 302/500
 - 2s - loss: 132391.9094 - acc: 0.0000e+00 - val_loss: 131229.8746 - val_acc: 2.8760e-04
Epoch 303/500
 - 2s - loss: 131707.0519 - acc: 0.0000e+00 - val_loss: 130998.9775 - val_acc: 2.8760e-04
Epoch 304/500
 - 2s - loss: 131770.6114 - acc: 0.0000e+00 - val_loss: 130860.9783 - val_acc: 2.8760e-04
Epoch 305/500
 - 2s - loss: 131316.9740 - acc: 0.0000e+00 - val_loss: 130569.3525 - val_acc: 2.8760e-04
Epoch 306/500
 - 2s - loss: 131226.9930 - acc: 0.0000e+00 - val_loss: 130390.5280 - val_acc: 2.8760e-04
Epoch 307/500
 - 2s - loss: 131082.0830 - acc: 0.0000e+00 - val_loss: 130142.1672 - val_acc: 2.8760e-04
Epoch 308/500
 - 2s - loss: 130818.9197 - acc: 0.0000e+00 - val_loss: 129940.8789 - val_acc: 2.8760e-04
Epoch 309/500
 - 2s - loss: 130634.6874 - acc: 0.0000e+00 - val_loss: 129719.4185 - val_acc: 0.0000e+00
Epoch 310/500
 - 2s - loss: 130407.1236 - acc: 0.0000e+00 - val_loss: 129534.5963 - val_acc: 0.0000e+00
Epoch 311/500
 - 2s - loss: 130050.6562 - acc: 2.8760e-04 - val_loss: 129318.2759 - val_acc: 0.0000e+00
Epoch 312/500
 - 2s - loss: 129741.3528 - acc: 0.0000e+00 - val_loss: 129077.4203 - val_acc: 0.0000e+00
Epoch 313/500
 - 2s - loss: 129648.2650 - acc: 2.8760e-04 - val_loss: 128897.3552 - val_acc: 0.0000e+00
Epoch 314/500
 - 2s - loss: 129079.6248 - acc: 0.0000e+00 - val_loss: 128679.0532 - val_acc: 0.0000e+00
Epoch 315/500
 - 2s - loss: 129015.5273 - acc: 0.0000e+00 - val_loss: 128445.0817 - val_acc: 0.0000e+00
Epoch 316/500
 - 2s - loss: 128854.6595 - acc: 0.0000e+00 - val_loss: 128233.0038 - val_acc: 0.0000e+00
Epoch 317/500
 - 2s - loss: 128528.2127 - acc: 2.8760e-04 - val_loss: 128067.1797 - val_acc: 0.0000e+00
Epoch 318/500
 - 3s - loss: 128362.8472 - acc: 0.0000e+00 - val_loss: 127800.0737 - val_acc: 0.0000e+00
Epoch 319/500
 - 2s - loss: 128830.5754 - acc: 0.0000e+00 - val_loss: 127632.3559 - val_acc: 0.0000e+00
Epoch 320/500
 - 2s - loss: 127922.3702 - acc: 0.0000e+00 - val_loss: 127393.0715 - val_acc: 0.0000e+00
Epoch 321/500
 - 2s - loss: 128191.6115 - acc: 0.0000e+00 - val_loss: 127212.7134 - val_acc: 0.0000e+00
Epoch 322/500
 - 2s - loss: 127737.3787 - acc: 0.0000e+00 - val_loss: 127005.5230 - val_acc: 0.0000e+00
Epoch 323/500
 - 2s - loss: 127223.6184 - acc: 0.0000e+00 - val_loss: 126800.6605 - val_acc: 0.0000e+00
Epoch 324/500
 - 2s - loss: 127267.6568 - acc: 0.0000e+00 - val_loss: 126647.8408 - val_acc: 0.0000e+00
Epoch 325/500
 - 2s - loss: 127213.1606 - acc: 0.0000e+00 - val_loss: 126412.9828 - val_acc: 0.0000e+00
Epoch 326/500
 - 2s - loss: 126879.9754 - acc: 0.0000e+00 - val_loss: 126213.7384 - val_acc: 0.0000e+00
Epoch 327/500
 - 2s - loss: 127034.6654 - acc: 0.0000e+00 - val_loss: 126006.6070 - val_acc: 0.0000e+00
Epoch 328/500
 - 2s - loss: 126579.0483 - acc: 0.0000e+00 - val_loss: 125799.9845 - val_acc: 0.0000e+00
Epoch 329/500
 - 2s - loss: 126431.9550 - acc: 0.0000e+00 - val_loss: 125591.3756 - val_acc: 0.0000e+00
Epoch 330/500
 - 2s - loss: 126025.4994 - acc: 0.0000e+00 - val_loss: 125358.1238 - val_acc: 0.0000e+00
Epoch 331/500
 - 2s - loss: 126414.0748 - acc: 0.0000e+00 - val_loss: 125422.7192 - val_acc: 0.0000e+00
Epoch 332/500
 - 2s - loss: 125656.5781 - acc: 2.8760e-04 - val_loss: 125285.3505 - val_acc: 0.0000e+00
Epoch 333/500
 - 2s - loss: 125506.2421 - acc: 0.0000e+00 - val_loss: 125072.9172 - val_acc: 0.0000e+00
Epoch 334/500
 - 2s - loss: 125291.1552 - acc: 0.0000e+00 - val_loss: 124818.9148 - val_acc: 0.0000e+00
Epoch 335/500
 - 2s - loss: 125665.7994 - acc: 0.0000e+00 - val_loss: 124656.9003 - val_acc: 0.0000e+00
Epoch 336/500
 - 2s - loss: 125133.5321 - acc: 0.0000e+00 - val_loss: 124464.1303 - val_acc: 2.8760e-04
Epoch 337/500
 - 2s - loss: 125129.6220 - acc: 0.0000e+00 - val_loss: 124254.1999 - val_acc: 2.8760e-04
Epoch 338/500
 - 2s - loss: 125087.9206 - acc: 2.8760e-04 - val_loss: 124007.1367 - val_acc: 2.8760e-04
Epoch 339/500
 - 2s - loss: 124544.2235 - acc: 0.0000e+00 - val_loss: 123795.1931 - val_acc: 2.8760e-04
Epoch 340/500
 - 2s - loss: 124476.6017 - acc: 2.8760e-04 - val_loss: 123546.4458 - val_acc: 2.8760e-04
Epoch 341/500
 - 2s - loss: 124409.0815 - acc: 0.0000e+00 - val_loss: 123435.3971 - val_acc: 2.8760e-04
Epoch 342/500
 - 2s - loss: 124222.2809 - acc: 0.0000e+00 - val_loss: 123210.3945 - val_acc: 2.8760e-04
Epoch 343/500
 - 2s - loss: 123483.9855 - acc: 0.0000e+00 - val_loss: 123004.7957 - val_acc: 2.8760e-04
Epoch 344/500
 - 2s - loss: 123237.5395 - acc: 0.0000e+00 - val_loss: 122784.7676 - val_acc: 2.8760e-04
Epoch 345/500
 - 2s - loss: 123495.0847 - acc: 2.8760e-04 - val_loss: 122589.2223 - val_acc: 2.8760e-04
Epoch 346/500
 - 2s - loss: 123176.3679 - acc: 0.0000e+00 - val_loss: 122412.7800 - val_acc: 2.8760e-04
Epoch 347/500
 - 2s - loss: 122947.2946 - acc: 0.0000e+00 - val_loss: 122149.0238 - val_acc: 2.8760e-04
Epoch 348/500
 - 2s - loss: 122607.1946 - acc: 0.0000e+00 - val_loss: 121988.6308 - val_acc: 2.8760e-04
Epoch 349/500
 - 2s - loss: 123112.6624 - acc: 0.0000e+00 - val_loss: 121744.0461 - val_acc: 2.8760e-04
Epoch 350/500
 - 3s - loss: 122248.9178 - acc: 0.0000e+00 - val_loss: 121546.0285 - val_acc: 2.8760e-04
Epoch 351/500
 - 3s - loss: 122417.9495 - acc: 0.0000e+00 - val_loss: 121388.6294 - val_acc: 2.8760e-04
Epoch 352/500
 - 2s - loss: 121990.3980 - acc: 0.0000e+00 - val_loss: 121189.6649 - val_acc: 2.8760e-04
Epoch 353/500
 - 2s - loss: 121513.1515 - acc: 0.0000e+00 - val_loss: 120950.4475 - val_acc: 2.8760e-04
Epoch 354/500
 - 2s - loss: 121628.1951 - acc: 0.0000e+00 - val_loss: 120770.8232 - val_acc: 2.8760e-04
Epoch 355/500
 - 2s - loss: 121141.6633 - acc: 0.0000e+00 - val_loss: 120577.8369 - val_acc: 2.8760e-04
Epoch 356/500
 - 2s - loss: 120797.2963 - acc: 0.0000e+00 - val_loss: 120368.8996 - val_acc: 2.8760e-04
Epoch 357/500
 - 2s - loss: 121282.4175 - acc: 2.8760e-04 - val_loss: 120169.9084 - val_acc: 2.8760e-04
Epoch 358/500
 - 2s - loss: 120809.6676 - acc: 0.0000e+00 - val_loss: 119981.9620 - val_acc: 2.8760e-04
Epoch 359/500
 - 2s - loss: 120432.3057 - acc: 0.0000e+00 - val_loss: 119819.1681 - val_acc: 2.8760e-04
Epoch 360/500
 - 2s - loss: 120271.7276 - acc: 0.0000e+00 - val_loss: 119587.4936 - val_acc: 2.8760e-04
Epoch 361/500
 - 2s - loss: 120509.1903 - acc: 0.0000e+00 - val_loss: 119389.7586 - val_acc: 2.8760e-04
Epoch 362/500
 - 2s - loss: 119403.7000 - acc: 0.0000e+00 - val_loss: 119171.7917 - val_acc: 0.0000e+00
Epoch 363/500
 - 2s - loss: 120639.1823 - acc: 0.0000e+00 - val_loss: 119439.4396 - val_acc: 0.0000e+00
Epoch 364/500
 - 2s - loss: 119797.8801 - acc: 0.0000e+00 - val_loss: 119160.3474 - val_acc: 0.0000e+00
Epoch 365/500
 - 2s - loss: 118745.3519 - acc: 2.8760e-04 - val_loss: 117996.5159 - val_acc: 0.0000e+00
Epoch 366/500
 - 2s - loss: 119136.0884 - acc: 0.0000e+00 - val_loss: 117812.1015 - val_acc: 0.0000e+00
Epoch 367/500
 - 2s - loss: 118339.2565 - acc: 0.0000e+00 - val_loss: 117578.3424 - val_acc: 0.0000e+00
Epoch 368/500
 - 2s - loss: 117649.0626 - acc: 0.0000e+00 - val_loss: 116664.5009 - val_acc: 2.8760e-04
Epoch 369/500
 - 2s - loss: 117700.4578 - acc: 0.0000e+00 - val_loss: 116558.5614 - val_acc: 2.8760e-04
Epoch 370/500
 - 2s - loss: 117138.0143 - acc: 0.0000e+00 - val_loss: 116268.3454 - val_acc: 2.8760e-04
Epoch 371/500
 - 2s - loss: 116837.7711 - acc: 0.0000e+00 - val_loss: 116110.7826 - val_acc: 2.8760e-04
Epoch 372/500
 - 2s - loss: 116561.4277 - acc: 0.0000e+00 - val_loss: 115830.5783 - val_acc: 2.8760e-04
Epoch 373/500
 - 2s - loss: 116927.2507 - acc: 0.0000e+00 - val_loss: 115710.2248 - val_acc: 2.8760e-04
Epoch 374/500
 - 2s - loss: 116672.1328 - acc: 0.0000e+00 - val_loss: 115489.0198 - val_acc: 2.8760e-04
Epoch 375/500
 - 2s - loss: 115780.3421 - acc: 0.0000e+00 - val_loss: 115262.0790 - val_acc: 2.8760e-04
Epoch 376/500
 - 2s - loss: 115960.9932 - acc: 0.0000e+00 - val_loss: 115038.9085 - val_acc: 2.8760e-04
Epoch 377/500
 - 2s - loss: 115530.5127 - acc: 0.0000e+00 - val_loss: 114813.6754 - val_acc: 0.0000e+00
Epoch 378/500
 - 2s - loss: 115317.4174 - acc: 0.0000e+00 - val_loss: 114693.2722 - val_acc: 0.0000e+00
Epoch 379/500
 - 3s - loss: 115141.8721 - acc: 0.0000e+00 - val_loss: 114441.8749 - val_acc: 0.0000e+00
Epoch 380/500
 - 2s - loss: 115077.0597 - acc: 0.0000e+00 - val_loss: 114240.7362 - val_acc: 0.0000e+00
Epoch 381/500
 - 2s - loss: 115287.9334 - acc: 0.0000e+00 - val_loss: 114066.8323 - val_acc: 0.0000e+00
Epoch 382/500
 - 2s - loss: 114989.8037 - acc: 0.0000e+00 - val_loss: 113839.9696 - val_acc: 0.0000e+00
Epoch 383/500
 - 2s - loss: 114200.0416 - acc: 0.0000e+00 - val_loss: 113677.0101 - val_acc: 0.0000e+00
Epoch 384/500
 - 2s - loss: 114247.6751 - acc: 0.0000e+00 - val_loss: 113528.1965 - val_acc: 0.0000e+00
Epoch 385/500
 - 2s - loss: 114054.0377 - acc: 0.0000e+00 - val_loss: 113264.1270 - val_acc: 0.0000e+00
Epoch 386/500
 - 2s - loss: 113723.0670 - acc: 0.0000e+00 - val_loss: 113045.2744 - val_acc: 0.0000e+00
Epoch 387/500
 - 2s - loss: 113786.5941 - acc: 0.0000e+00 - val_loss: 112855.6029 - val_acc: 0.0000e+00
Epoch 388/500
 - 2s - loss: 113634.7628 - acc: 0.0000e+00 - val_loss: 112633.6267 - val_acc: 0.0000e+00
Epoch 389/500
 - 2s - loss: 113396.8655 - acc: 0.0000e+00 - val_loss: 112482.6460 - val_acc: 0.0000e+00
Epoch 390/500
 - 2s - loss: 112991.9969 - acc: 0.0000e+00 - val_loss: 112256.1113 - val_acc: 0.0000e+00
Epoch 391/500
 - 2s - loss: 112913.9521 - acc: 0.0000e+00 - val_loss: 112082.2194 - val_acc: 0.0000e+00
Epoch 392/500
 - 2s - loss: 112492.2761 - acc: 0.0000e+00 - val_loss: 111849.5450 - val_acc: 0.0000e+00
Epoch 393/500
 - 2s - loss: 112715.9312 - acc: 2.8760e-04 - val_loss: 111689.9044 - val_acc: 0.0000e+00
Epoch 394/500
 - 2s - loss: 112070.0193 - acc: 0.0000e+00 - val_loss: 111487.3732 - val_acc: 0.0000e+00
Epoch 395/500
 - 2s - loss: 111965.0013 - acc: 0.0000e+00 - val_loss: 111277.0022 - val_acc: 0.0000e+00
Epoch 396/500
 - 2s - loss: 112032.1379 - acc: 2.8760e-04 - val_loss: 111080.6163 - val_acc: 0.0000e+00
Epoch 397/500
 - 2s - loss: 111746.1851 - acc: 0.0000e+00 - val_loss: 110865.9358 - val_acc: 0.0000e+00
Epoch 398/500
 - 2s - loss: 111350.7088 - acc: 0.0000e+00 - val_loss: 110683.5162 - val_acc: 0.0000e+00
Epoch 399/500
 - 2s - loss: 111192.6004 - acc: 0.0000e+00 - val_loss: 110493.2994 - val_acc: 0.0000e+00
Epoch 400/500
 - 2s - loss: 111064.0273 - acc: 0.0000e+00 - val_loss: 110349.7869 - val_acc: 0.0000e+00
Epoch 401/500
 - 2s - loss: 111135.3876 - acc: 0.0000e+00 - val_loss: 110157.0301 - val_acc: 0.0000e+00
Epoch 402/500
 - 2s - loss: 110886.2793 - acc: 0.0000e+00 - val_loss: 109948.8815 - val_acc: 0.0000e+00
Epoch 403/500
 - 2s - loss: 111133.0543 - acc: 0.0000e+00 - val_loss: 109758.7219 - val_acc: 0.0000e+00
Epoch 404/500
 - 2s - loss: 110534.4626 - acc: 0.0000e+00 - val_loss: 109576.6390 - val_acc: 0.0000e+00
Epoch 405/500
 - 2s - loss: 109774.7329 - acc: 2.8760e-04 - val_loss: 109396.6508 - val_acc: 0.0000e+00
Epoch 406/500
 - 2s - loss: 109774.2747 - acc: 0.0000e+00 - val_loss: 109144.6542 - val_acc: 0.0000e+00
Epoch 407/500
 - 2s - loss: 110380.5870 - acc: 0.0000e+00 - val_loss: 108972.1907 - val_acc: 0.0000e+00
Epoch 408/500
 - 2s - loss: 109792.3153 - acc: 0.0000e+00 - val_loss: 108749.0949 - val_acc: 0.0000e+00
Epoch 409/500
 - 3s - loss: 109318.5718 - acc: 0.0000e+00 - val_loss: 108578.2403 - val_acc: 0.0000e+00
Epoch 410/500
 - 2s - loss: 109326.5530 - acc: 0.0000e+00 - val_loss: 108398.6258 - val_acc: 0.0000e+00
Epoch 411/500
 - 2s - loss: 109079.9669 - acc: 0.0000e+00 - val_loss: 108180.8901 - val_acc: 0.0000e+00
Epoch 412/500
 - 2s - loss: 108734.4508 - acc: 0.0000e+00 - val_loss: 107940.2394 - val_acc: 0.0000e+00
Epoch 413/500
 - 2s - loss: 108726.4618 - acc: 2.8760e-04 - val_loss: 107790.0739 - val_acc: 0.0000e+00
Epoch 414/500
 - 2s - loss: 108553.8092 - acc: 2.8760e-04 - val_loss: 107601.8967 - val_acc: 0.0000e+00
Epoch 415/500
 - 2s - loss: 108274.0472 - acc: 2.8760e-04 - val_loss: 107391.1087 - val_acc: 0.0000e+00
Epoch 416/500
 - 2s - loss: 108062.3303 - acc: 0.0000e+00 - val_loss: 107212.7808 - val_acc: 0.0000e+00
Epoch 417/500
 - 2s - loss: 108198.9622 - acc: 0.0000e+00 - val_loss: 107011.9945 - val_acc: 0.0000e+00
Epoch 418/500
 - 2s - loss: 108044.6150 - acc: 0.0000e+00 - val_loss: 106828.9488 - val_acc: 0.0000e+00
Epoch 419/500
 - 2s - loss: 107435.8329 - acc: 0.0000e+00 - val_loss: 106676.6987 - val_acc: 0.0000e+00
Epoch 420/500
 - 2s - loss: 107447.0995 - acc: 0.0000e+00 - val_loss: 106476.0243 - val_acc: 0.0000e+00
Epoch 421/500
 - 2s - loss: 106990.7731 - acc: 0.0000e+00 - val_loss: 106316.3703 - val_acc: 0.0000e+00
Epoch 422/500
 - 2s - loss: 106989.1432 - acc: 0.0000e+00 - val_loss: 106114.7194 - val_acc: 0.0000e+00
Epoch 423/500
 - 2s - loss: 106396.0855 - acc: 0.0000e+00 - val_loss: 105915.4027 - val_acc: 0.0000e+00
Epoch 424/500
 - 2s - loss: 106907.2003 - acc: 0.0000e+00 - val_loss: 105736.8322 - val_acc: 0.0000e+00
Epoch 425/500
 - 2s - loss: 106558.9849 - acc: 0.0000e+00 - val_loss: 105543.9781 - val_acc: 0.0000e+00
Epoch 426/500
 - 2s - loss: 106139.7156 - acc: 0.0000e+00 - val_loss: 105384.5317 - val_acc: 0.0000e+00
Epoch 427/500
 - 2s - loss: 106073.7979 - acc: 0.0000e+00 - val_loss: 105137.7199 - val_acc: 0.0000e+00
Epoch 428/500
 - 2s - loss: 106076.5889 - acc: 2.8760e-04 - val_loss: 104977.6089 - val_acc: 0.0000e+00
Epoch 429/500
 - 2s - loss: 105804.6027 - acc: 0.0000e+00 - val_loss: 104804.6289 - val_acc: 0.0000e+00
Epoch 430/500
 - 2s - loss: 105256.3703 - acc: 0.0000e+00 - val_loss: 104595.1511 - val_acc: 0.0000e+00
Epoch 431/500
 - 2s - loss: 105807.0633 - acc: 0.0000e+00 - val_loss: 104425.6364 - val_acc: 0.0000e+00
Epoch 432/500
 - 2s - loss: 104712.9182 - acc: 0.0000e+00 - val_loss: 104193.5394 - val_acc: 0.0000e+00
Epoch 433/500
 - 2s - loss: 104734.8768 - acc: 0.0000e+00 - val_loss: 104047.6493 - val_acc: 0.0000e+00
Epoch 434/500
 - 2s - loss: 104429.0161 - acc: 0.0000e+00 - val_loss: 103818.8530 - val_acc: 0.0000e+00
Epoch 435/500
 - 2s - loss: 104789.6414 - acc: 0.0000e+00 - val_loss: 103663.2740 - val_acc: 0.0000e+00
Epoch 436/500
 - 2s - loss: 104433.4344 - acc: 0.0000e+00 - val_loss: 103498.8176 - val_acc: 0.0000e+00
Epoch 437/500
 - 2s - loss: 104889.8699 - acc: 0.0000e+00 - val_loss: 103451.2628 - val_acc: 0.0000e+00
Epoch 438/500
 - 2s - loss: 104864.0465 - acc: 0.0000e+00 - val_loss: 103255.0162 - val_acc: 0.0000e+00
Epoch 439/500
 - 2s - loss: 104040.8685 - acc: 0.0000e+00 - val_loss: 103008.7451 - val_acc: 0.0000e+00
Epoch 440/500
 - 2s - loss: 104036.8276 - acc: 0.0000e+00 - val_loss: 102763.1904 - val_acc: 0.0000e+00
Epoch 441/500
 - 2s - loss: 103732.5937 - acc: 5.7521e-04 - val_loss: 102576.6484 - val_acc: 0.0000e+00
Epoch 442/500
 - 2s - loss: 103316.5982 - acc: 2.8760e-04 - val_loss: 102413.2076 - val_acc: 0.0000e+00
Epoch 443/500
 - 2s - loss: 102872.2242 - acc: 0.0000e+00 - val_loss: 102223.3422 - val_acc: 0.0000e+00
Epoch 444/500
 - 2s - loss: 102570.9777 - acc: 0.0000e+00 - val_loss: 102027.9098 - val_acc: 0.0000e+00
Epoch 445/500
 - 2s - loss: 102718.0920 - acc: 0.0000e+00 - val_loss: 101844.8713 - val_acc: 0.0000e+00
Epoch 446/500
 - 2s - loss: 102045.5794 - acc: 0.0000e+00 - val_loss: 101680.7750 - val_acc: 0.0000e+00
Epoch 447/500
 - 2s - loss: 102134.2523 - acc: 0.0000e+00 - val_loss: 101477.5510 - val_acc: 0.0000e+00
Epoch 448/500
 - 2s - loss: 102052.5260 - acc: 0.0000e+00 - val_loss: 101285.5288 - val_acc: 0.0000e+00
Epoch 449/500
 - 2s - loss: 101594.2875 - acc: 0.0000e+00 - val_loss: 101109.7019 - val_acc: 0.0000e+00
Epoch 450/500
 - 2s - loss: 101836.7719 - acc: 0.0000e+00 - val_loss: 100943.6652 - val_acc: 0.0000e+00
Epoch 451/500
 - 2s - loss: 101650.6314 - acc: 0.0000e+00 - val_loss: 100745.4184 - val_acc: 0.0000e+00
Epoch 452/500
 - 2s - loss: 101238.4699 - acc: 0.0000e+00 - val_loss: 100550.1520 - val_acc: 0.0000e+00
Epoch 453/500
 - 2s - loss: 101322.3064 - acc: 0.0000e+00 - val_loss: 100375.6924 - val_acc: 0.0000e+00
Epoch 454/500
 - 2s - loss: 101109.8404 - acc: 2.8760e-04 - val_loss: 100223.6442 - val_acc: 0.0000e+00
Epoch 455/500
 - 2s - loss: 100752.8784 - acc: 0.0000e+00 - val_loss: 100033.8470 - val_acc: 0.0000e+00
Epoch 456/500
 - 2s - loss: 101362.2476 - acc: 0.0000e+00 - val_loss: 99922.4070 - val_acc: 0.0000e+00
Epoch 457/500
 - 2s - loss: 100678.4181 - acc: 0.0000e+00 - val_loss: 99667.7510 - val_acc: 0.0000e+00
Epoch 458/500
 - 2s - loss: 100373.5132 - acc: 0.0000e+00 - val_loss: 99564.4725 - val_acc: 0.0000e+00
Epoch 459/500
 - 2s - loss: 100466.4105 - acc: 0.0000e+00 - val_loss: 99343.9122 - val_acc: 0.0000e+00
Epoch 460/500
 - 2s - loss: 100088.5328 - acc: 0.0000e+00 - val_loss: 99202.9573 - val_acc: 0.0000e+00
Epoch 461/500
 - 2s - loss: 99626.3508 - acc: 0.0000e+00 - val_loss: 98993.2318 - val_acc: 0.0000e+00
Epoch 462/500
 - 2s - loss: 99258.5677 - acc: 2.8760e-04 - val_loss: 98808.4174 - val_acc: 0.0000e+00
Epoch 463/500
 - 2s - loss: 99332.5237 - acc: 0.0000e+00 - val_loss: 98597.1025 - val_acc: 0.0000e+00
Epoch 464/500
 - 2s - loss: 99095.4014 - acc: 0.0000e+00 - val_loss: 98432.5474 - val_acc: 0.0000e+00
Epoch 465/500
 - 2s - loss: 99226.5777 - acc: 0.0000e+00 - val_loss: 98223.7187 - val_acc: 0.0000e+00
Epoch 466/500
 - 2s - loss: 98936.5366 - acc: 2.8760e-04 - val_loss: 98077.4166 - val_acc: 0.0000e+00
Epoch 467/500
 - 2s - loss: 98699.9250 - acc: 0.0000e+00 - val_loss: 97904.7124 - val_acc: 0.0000e+00
Epoch 468/500
 - 3s - loss: 98909.5774 - acc: 0.0000e+00 - val_loss: 97709.9274 - val_acc: 0.0000e+00
Epoch 469/500
 - 3s - loss: 98431.2522 - acc: 0.0000e+00 - val_loss: 97502.6008 - val_acc: 0.0000e+00
Epoch 470/500
 - 2s - loss: 98540.8342 - acc: 0.0000e+00 - val_loss: 97381.7798 - val_acc: 0.0000e+00
Epoch 471/500
 - 2s - loss: 97969.8392 - acc: 0.0000e+00 - val_loss: 97178.9770 - val_acc: 0.0000e+00
Epoch 472/500
 - 2s - loss: 97845.4831 - acc: 0.0000e+00 - val_loss: 97032.6988 - val_acc: 0.0000e+00
Epoch 473/500
 - 2s - loss: 98129.0703 - acc: 2.8760e-04 - val_loss: 96842.5350 - val_acc: 0.0000e+00
Epoch 474/500
 - 2s - loss: 97265.9202 - acc: 0.0000e+00 - val_loss: 96669.7758 - val_acc: 0.0000e+00
Epoch 475/500
 - 2s - loss: 97219.3831 - acc: 0.0000e+00 - val_loss: 96507.1375 - val_acc: 0.0000e+00
Epoch 476/500
 - 2s - loss: 96995.6086 - acc: 0.0000e+00 - val_loss: 96292.6584 - val_acc: 0.0000e+00
Epoch 477/500
 - 2s - loss: 97047.4388 - acc: 0.0000e+00 - val_loss: 96151.5447 - val_acc: 0.0000e+00
Epoch 478/500
 - 2s - loss: 96775.5720 - acc: 0.0000e+00 - val_loss: 95950.7381 - val_acc: 0.0000e+00
Epoch 479/500
 - 2s - loss: 96470.9253 - acc: 0.0000e+00 - val_loss: 95802.9676 - val_acc: 0.0000e+00
Epoch 480/500
 - 2s - loss: 96606.6446 - acc: 2.8760e-04 - val_loss: 95629.9811 - val_acc: 0.0000e+00
Epoch 481/500
 - 2s - loss: 96416.2753 - acc: 0.0000e+00 - val_loss: 95434.3204 - val_acc: 0.0000e+00
Epoch 482/500
 - 2s - loss: 96372.4960 - acc: 2.8760e-04 - val_loss: 95266.8583 - val_acc: 0.0000e+00
Epoch 483/500
 - 2s - loss: 96269.1918 - acc: 2.8760e-04 - val_loss: 95098.4984 - val_acc: 0.0000e+00
Epoch 484/500
 - 3s - loss: 96293.3536 - acc: 0.0000e+00 - val_loss: 94966.5952 - val_acc: 0.0000e+00
Epoch 485/500
 - 2s - loss: 96376.9943 - acc: 0.0000e+00 - val_loss: 94873.2154 - val_acc: 0.0000e+00
Epoch 486/500
 - 2s - loss: 96008.5653 - acc: 0.0000e+00 - val_loss: 94760.8468 - val_acc: 0.0000e+00
Epoch 487/500
 - 2s - loss: 95918.8077 - acc: 0.0000e+00 - val_loss: 94555.3004 - val_acc: 0.0000e+00
Epoch 488/500
 - 2s - loss: 95363.1538 - acc: 0.0000e+00 - val_loss: 94336.3366 - val_acc: 0.0000e+00
Epoch 489/500
 - 2s - loss: 95038.5756 - acc: 0.0000e+00 - val_loss: 94161.1894 - val_acc: 0.0000e+00
Epoch 490/500
 - 2s - loss: 94864.6446 - acc: 0.0000e+00 - val_loss: 93962.7288 - val_acc: 0.0000e+00
Epoch 491/500
 - 2s - loss: 94814.0764 - acc: 0.0000e+00 - val_loss: 93803.0049 - val_acc: 0.0000e+00
Epoch 492/500
 - 2s - loss: 94703.7320 - acc: 0.0000e+00 - val_loss: 93614.7775 - val_acc: 0.0000e+00
Epoch 493/500
 - 2s - loss: 94131.7050 - acc: 0.0000e+00 - val_loss: 93449.1868 - val_acc: 0.0000e+00
Epoch 494/500
 - 2s - loss: 94446.0429 - acc: 2.8760e-04 - val_loss: 93351.3660 - val_acc: 0.0000e+00
Epoch 495/500
 - 2s - loss: 93959.9684 - acc: 0.0000e+00 - val_loss: 93054.1864 - val_acc: 0.0000e+00
Epoch 496/500
 - 2s - loss: 93791.6487 - acc: 0.0000e+00 - val_loss: 92952.5649 - val_acc: 0.0000e+00
Epoch 497/500
 - 2s - loss: 93524.1133 - acc: 0.0000e+00 - val_loss: 92721.0337 - val_acc: 0.0000e+00
Epoch 498/500
 - 2s - loss: 93566.1991 - acc: 0.0000e+00 - val_loss: 92584.3074 - val_acc: 0.0000e+00
Epoch 499/500
 - 2s - loss: 93403.5945 - acc: 0.0000e+00 - val_loss: 92405.5947 - val_acc: 0.0000e+00
Epoch 500/500
 - 2s - loss: 93290.7362 - acc: 0.0000e+00 - val_loss: 92233.8541 - val_acc: 0.0000e+00
In [168]:
# plot history
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='test val_loss')
plt.legend()
plt.show()
In [169]:
y_pred_lstm_imputed = model.predict(X_test)
plt.plot(Y_test.reshape(-1,1))
plt.plot(y_pred_lstm_imputed)
Out[169]:
[<matplotlib.lines.Line2D at 0x1e54d899630>]
In [170]:
plt.plot(Y_test.reshape(-1,1))
plt.plot(y_pred_lstm_imputed)

plt.title('LSTM predicted vs actual values')
#plt.xlabel('Actual Price')
#plt.ylabel('Predicted Price')
#plt.legend(Y_test,y_pred_lstm_imputed)
plt.show()
In [171]:
print(Y_test.shape,X_test.shape,y_pred_lstm_imputed.shape)
(71, 1) (71, 1, 10) (71, 1)

Check and updated shape of X_test. Scale X_train and x_test on correct shape of dataframe.

In [172]:
#Inverse transform the prediction and Y_test
X_test = X_test.reshape((X_test.shape[0],X_test.shape[2]))

# Concatenate with X_test to reshape before inverse transform
y_pred_lstm_imputed = concatenate((y_pred_lstm_imputed,X_test),axis=1)
print(X_test.shape,y_pred_lstm_imputed.shape,Y_test.shape)

#Invert scaling for forecast
y_pred_lstm_imputed = scaler_df_imputed.inverse_transform(y_pred_lstm_imputed)
print(X_test.shape,y_pred_lstm_imputed.shape,Y_test.shape)

#slice reverse transformed y_pred values to compare with Y_test actual values
y_pred_lstm_imputed = y_pred_lstm_imputed[:,0:1]
(71, 10) (71, 11) (71, 1)
(71, 10) (71, 11) (71, 1)
In [173]:
#Reconsturct Y_test with X_test and inverse transform it.
Y_test = concatenate((Y_test,X_test),axis=1)
Y_test = scaler_df_imputed.inverse_transform(Y_test)
#slice reverse transformed y_pred values to compare with Y_test actual values
Y_test = Y_test[:,0:1]
In [174]:
print(X_test.shape,y_pred_lstm_imputed.shape,Y_test.shape)
(71, 10) (71, 1) (71, 1)

Evaluate Model LSTM model

In [175]:
#Calculate MSE between actual values Y_test and inverse transformed y_pred
mse = mean_squared_error(Y_test,y_pred_lstm_imputed)
print('Means Square Error between Y_test adn prediction values is :', mse)
Means Square Error between Y_test adn prediction values is : 982416894554.094
In [176]:
print('predicted- {} ; Actual - {}'.format(y_pred_lstm_imputed[-1], Y_test[-1]))
predicted- [297487.70399435] ; Actual - [1460579.96104185]
In [177]:
r2score = r2_score(Y_test,y_pred_lstm_imputed)
print('R2 score between Y_test adn prediction value is :', r2score)
R2 score between Y_test adn prediction value is : -45.0776194785916
In [178]:
model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_14 (LSTM)               (None, 512)               1071104   
_________________________________________________________________
dropout_9 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 513       
=================================================================
Total params: 1,071,617
Trainable params: 1,071,617
Non-trainable params: 0
_________________________________________________________________

Apply Linear Regression model

In [179]:
# from sklearn import linear_model
# from sklearn.metrics import mean_squared_error, r2_score

Plotting the Least Squares Line

Best observed high variance features Vs target label 'Value_SP500_REAL_PRICE_MONTH' to understand linear corelation.

This is to find colinearity between features and the prediction line for selected set of features. Predicted line would try to fit with linear regression model through the feature data points.

The dataset here is very imbalanced and non linear which makes it different to predict a linear line with optimum coefficients and intercept.
In [180]:
 #5 features with highest variance.
max_var_cols1 = Print_PCAfeatures_graph(df_imputed,a,-10)

# visualize relationship between the features and the target price using scatterplots
sns.pairplot(df_imputed, x_vars=max_var_cols1,y_vars=['Value_SP500_REAL_PRICE_MONTH'], height=10, aspect=0.5,kind='reg')
<Figure size 3000x2000 with 0 Axes>
Out[180]:
<seaborn.axisgrid.PairGrid at 0x1e55f6b4f28>
In [181]:
 #5 feature with highest PCA variance.
max_var_cols2 = Print_PCAfeatures_graph(df_interpolate,b,-10)
print(max_var_cols2)

# visualize relationship between the features and the target price using seaborn scatterplot. 
#High variance features Vs target label
sns.pairplot(df_interpolate, x_vars=max_var_cols2,y_vars=['Value_SP500_REAL_PRICE_MONTH'], height=10, aspect=0.5,kind='reg')
<Figure size 3000x2000 with 0 Axes>
['Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_PBV_RATIO_YEAR', 'Value_SP500_PE_RATIO_MONTH', 'Value_SP500_PBV_RATIO_QUARTER', 'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_SALES_YEAR', 'Value_SHILLER_PE_RATIO_YEAR', 'Value_SP500_EARNINGS_YEAR', 'Value_SP500_DIV_YIELD_MONTH', 'Value_SP500_REAL_SALES_GROWTH_YEAR']
Out[181]:
<seaborn.axisgrid.PairGrid at 0x1e560dd9278>
In [182]:
 #5 feature with highest PCA variance.
max_var_cols3 = Print_PCAfeatures_graph(df_interpolate,b,10)
print(max_var_cols3)

# visualize relationship between the features and the target price using seaborn scatterplot. 
#High variance features Vs target label
sns.pairplot(df_interpolate, x_vars=max_var_cols3,y_vars=['Value_SP500_REAL_PRICE_MONTH'], height=10, aspect=0.5,kind='reg')
<Figure size 3000x2000 with 0 Axes>
['Value_SP500_DIV_GROWTH_YEAR', 'Value_SP500_DIV_YEAR', 'Value_SP500_REAL_SALES_GROWTH_QUARTER', 'Value_SP500_SALES_YEAR', 'Value_SP500_SALES_QUARTER', 'Value_SP500_SALES_GROWTH_QUARTER', 'Value_SP500_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_EARNINGS_GROWTH_YEAR', 'Value_SHILLER_PE_RATIO_MONTH', 'Value_SP500_PSR_YEAR']
Out[182]:
<seaborn.axisgrid.PairGrid at 0x1e5708805c0>

Split into training and test dataset for Linear Regression

In [183]:
#Split data into train and test
#X_train, Y_train, X_test, Y_test = Create_Training_Test_Dataset(df_imputed,0.8,Linear_regr='True')
In [184]:
scaler = StandardScaler()
scaled_interpolated = scaler.fit_transform(df_PCA_features.values)
df_scaled_interpolated = pd.DataFrame(scaled_interpolated,index=df_PCA_features.index,columns = df_PCA_features.columns)
In [185]:
# Split the size into 80% and 20% row-wise
split_percent = 0.8
train_size = int(len(df_PCA_features) * split_percent)
test_size = len(df_PCA_features) - train_size
#print(train_size,test_size)
print('Training and Test dataset is of size {} & {}'.format(train_size,test_size))

#Slice the df into train and test df.
train = df_PCA_features.iloc[0:train_size,:]
test = df_PCA_features.iloc[train_size:len(df),:]
print(train.shape, test.shape)
Training and Test dataset is of size 2838 & 710
(2838, 11) (710, 11)
In [186]:
# #Slice the df into train and test df.
# train = df_scaled_interpolated.iloc[0:train_size,:]
# test = df_scaled_interpolated.iloc[train_size:len(df),:]
# print(train.shape, test.shape)
In [187]:
# Slice Train dataset.
temp_train = train.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
X_train = train.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
Y_train = train['Value_SP500_REAL_PRICE_MONTH']
print('Features size of X_train and training target Y_train shape is {} & {}'.format(X_train.shape,Y_train.shape))

#Test dataset
X_test = test.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
Y_test = test['Value_SP500_REAL_PRICE_MONTH']
print('Features size of X_test and Test target Y_test shape is {} & {}'.format(X_test.shape,Y_test.shape))
Features size of X_train and training target Y_train shape is (2838, 10) & (2838,)
Features size of X_test and Test target Y_test shape is (710, 10) & (710,)
In [188]:
#np.array(temp_train[:]).reshape(-1,1)
#print((np.array(temp_train[:]).reshape(-1,2)).shape)

Instantiate linear regression model from scikit-learn

In [189]:
#from sklearn.linear_model import LinearRegression
# Instantiate linear regression object
regr = LinearRegression()

# Train the model using the training sets
regr.fit(X_train, Y_train)
Out[189]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

Interpreting model coefficients

In [190]:
# print the intercept and coefficients
print(regr.intercept_)
print(regr.coef_)
137.11463021015638
[-3.89875853e+00 -5.67269650e-01 -5.46924799e+00 -6.41997252e+00
  1.29380410e-01  2.46530686e+00  0.00000000e+00 -3.98730436e-02
  1.55754077e-04  1.55754077e-04]

Making Predictions

In [191]:
#Linear regression prediction on testing set
y_pred_inter_regr = regr.predict(X_test)
In [192]:
print(y_pred_inter_regr.shape,X_test.shape,Y_test.shape)
(710,) (710, 10) (710,)

Evaluate

Model evaluation metrics for linear regression

In [193]:
feature_cols = X_train.columns
# pair the feature names with the coefficients
print('Coefficients of all feature variables in training dataset are: ')
list(zip(feature_cols, regr.coef_))
Coefficients of all feature variables in training dataset are: 
Out[193]:
[('Value_SP500_DIV_YIELD_MONTH', -3.8987585275669585),
 ('Value_SP500_PE_RATIO_MONTH', -0.5672696500428746),
 ('Value_SHILLER_PE_RATIO_MONTH', -5.469247991877379),
 ('Value_SP500_EARNINGS_YIELD_MONTH', -6.41997252365831),
 ('Value_SP500_INFLADJ_MONTH', 0.12938040986247704),
 ('Value_SP500_EARNINGS_MONTH', 2.465306856497017),
 ('Value_SP500_PSR_QUARTER', 0.0),
 ('Value_SP500_SALES_QUARTER', -0.03987304360362631),
 ('Value_SP500_REAL_SALES_GROWTH_QUARTER', 0.00015575407657666528),
 ('Value_SP500_REAL_EARNINGS_GROWTH_QUARTER', 0.00015575407657666528)]
In [194]:
# The coefficients
#print('Coefficients: \n', regr.coef_)

# The mean squared error:  is always non-negative, and values closer to zero are better.
print('Mean squared error: {}'.format(mean_squared_error(Y_test, y_pred_inter_regr)))

# The Root mean squared error: RMSE is popular than MSE, because RMSE is interpretable in the "y" units.
print('Root Mean squared error: {}'.format(np.sqrt(mean_squared_error(Y_test, y_pred_inter_regr))))

# Explained variance score: 1 is perfect prediction
print('Variance R2 score: {}'.format(r2_score(Y_test, y_pred_inter_regr)))
print('intercept: ',regr.intercept_)
print('Score : ',regr.score)
Mean squared error: 1076172.6990871658
Root Mean squared error: 1037.3874392372243
Variance R2 score: -1.9382920775190278
intercept:  137.11463021015638
Score :  <bound method RegressorMixin.score of LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)>
In [195]:
#Reshape prediction and test set to concatenate with X_test to get origninal form of test dataset.
y_pred_inter_regr = y_pred_inter_regr[:,np.newaxis]
Y_test = Y_test[:,np.newaxis]
print(X_train.shape,X_test.shape,y_pred_inter_regr.shape,Y_test.shape)
y_pred_inter_regr = concatenate((y_pred_inter_regr, X_test), axis=1)
print(X_test.shape,y_pred_inter_regr.shape,Y_test.shape)
(2838, 10) (710, 10) (710, 1) (710, 1)
(710, 10) (710, 11) (710, 1)
In [196]:
# # Plot outputs
# for col in feature_cols:
#     plt.scatter(X_test[col], Y_test)
#     #plt.plot(X_test[col], y_pred_imputed_regr)
#     #print(col)

# plt.xticks(())
# plt.yticks(())
# plt.show()
In [197]:
feature_cols = X_train.columns
# pair the feature names with the coefficients
print('Coefficients of all feature variables in training dataset are: ')
list(zip(feature_cols, regr.coef_))
Coefficients of all feature variables in training dataset are: 
Out[197]:
[('Value_SP500_DIV_YIELD_MONTH', -3.8987585275669585),
 ('Value_SP500_PE_RATIO_MONTH', -0.5672696500428746),
 ('Value_SHILLER_PE_RATIO_MONTH', -5.469247991877379),
 ('Value_SP500_EARNINGS_YIELD_MONTH', -6.41997252365831),
 ('Value_SP500_INFLADJ_MONTH', 0.12938040986247704),
 ('Value_SP500_EARNINGS_MONTH', 2.465306856497017),
 ('Value_SP500_PSR_QUARTER', 0.0),
 ('Value_SP500_SALES_QUARTER', -0.03987304360362631),
 ('Value_SP500_REAL_SALES_GROWTH_QUARTER', 0.00015575407657666528),
 ('Value_SP500_REAL_EARNINGS_GROWTH_QUARTER', 0.00015575407657666528)]

MSE, RMSE and R2 score are very bad with all features and PCA selected features.

In [198]:
# from fbprophet import Prophet
# # plt.style.available
# plt.style.use("seaborn-whitegrid")
# import plotly.figure_factory as ff
# import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import statsmodels.api as sm
# from scipy import stats
In [199]:
#Reset index of df to make it a column as prerequisite for fbProphet.
df_fb = df_imputed.reset_index()
df_interpolate_fb = df_interpolate.reset_index()
df_interpolate_fb.tail() #for further use.

#Change dtype of Date column to datetime64 for fbProphet
df_interpolate_fb.loc[:,'Date'] = pd.to_datetime(df_interpolate_fb.loc[:,'Date'],format = '%Y%m%d')
In [200]:
#Plotting of S&P interpolated price vs date 
# First Subplot
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14,5))
ax1.plot(df_interpolate_fb['Date'], df_interpolate_fb['Value_SP500_REAL_PRICE_MONTH'],color ='blue')
#ax1.plot(df_fb['Date'], df_fb['Value_SP500_REAL_PRICE_MONTH'],color='red')
ax1.set_xlabel("Date", fontsize=12)
ax1.set_ylabel("Real Price")
ax1.set_title("S&P 500 Index price History")

# #Second Subplot- original plot
# ax1.plot(df_fb['Date'], df_fb['Value_SP500_REAL_PRICE_MONTH'],color='red')
# ax1.set_xlabel("Date", fontsize=12)
# ax1.set_ylabel("Original S&P 500 Real Index price")
# ax1.set_title("S&P 500 Index price History")


#Third subplot
ax2.plot(df_interpolate_fb['Date'], df_interpolate_fb['Value_SP500_REAL_SALES_GROWTH_YEAR'],color='orange')
ax2.set_xlabel("Date", fontsize=12)
ax2.set_ylabel("SP500_REAL_SALES_GROWTH_YEAR")
ax2.set_title("SP500_REAL_SALES_GROWTH_YEAR")
plt.show()

Instantiate fbProphet

In [201]:
#Data preparation for fbprophet model. model takes date/timeseries and target label as input for fit and predict
df_interpolate_fb = df_interpolate_fb[['Date','Value_SP500_REAL_PRICE_MONTH']]

#Rename column names to 'ds' and 'y' as required by fbprophet model.
df_interpolate_fb.rename(columns={'Date':'ds','Value_SP500_REAL_PRICE_MONTH' : 'y'},inplace=True)

print(df_interpolate_fb.info(),df_interpolate_fb.shape)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3548 entries, 0 to 3547
Data columns (total 2 columns):
ds    3548 non-null datetime64[ns]
y     3548 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 55.5 KB
None (3548, 2)
In [202]:
fb = Prophet()
In [203]:
fb.fit(df_interpolate_fb)
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
C:\Users\Akshat\Anaconda3\lib\site-packages\pystan\misc.py:399: FutureWarning:

Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Out[203]:
<fbprophet.forecaster.Prophet at 0x1e5515c5278>
In [204]:
#Create future dates
future_dates = fb.make_future_dataframe(periods=30)

#Predict prices for future dates
future_price = fb.predict(future_dates)
In [205]:
future_price[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
Out[205]:
ds yhat yhat_lower yhat_upper
3573 2019-01-26 2104.546587 1953.566674 2269.087231
3574 2019-01-27 2109.010824 1959.552406 2266.648516
3575 2019-01-28 2109.505319 1955.279627 2277.497805
3576 2019-01-29 2111.732478 1961.375006 2265.803894
3577 2019-01-30 2115.552914 1964.223491 2276.856778
In [206]:
import matplotlib.dates as mdates
In [207]:
# Dates
starting_date = dt.datetime(2018, 11, 30)
starting_date1 = mdates.date2num(starting_date)
trend_date = dt.datetime(2019, 1, 1)
trend_date1 = mdates.date2num(trend_date)

pointing_arrow = dt.datetime(2019, 1, 1)
pointing_arrow1 = mdates.date2num(pointing_arrow)

# Plot.
fig = fb.plot(future_price)
ax1 = fig.add_subplot(111)
ax1.set_title("S&P500 Index Price Forecast", fontsize=16)
ax1.set_xlabel("Date", fontsize=12)
ax1.set_ylabel("Real Price", fontsize=12)

# Forecast initialization arrow
ax1.annotate('Forecast \n Initialization', xy=(pointing_arrow1, 2100), xytext=(starting_date1,2500),
            arrowprops=dict(facecolor='#ff7f50', shrink=0.1),
            )

# Trend emphasis arrow
ax1.annotate('Upward Trend', xy=(trend_date1, 2108), xytext=(trend_date1,950),
            arrowprops=dict(facecolor='#6cff6c', shrink=0.1),
            )

ax1.axhline(y=1260, color='b', linestyle='-')

plt.show()
C:\Users\Akshat\Anaconda3\lib\site-packages\matplotlib\cbook\deprecation.py:107: MatplotlibDeprecationWarning:

Adding an axes using the same arguments as a previous axes currently reuses the earlier instance.  In a future version, a new instance will always be created and returned.  Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.

In [208]:
fig2 = fb.plot_components(future_price)
plt.show()
In [209]:
# Monthly Data Predictions
fbm = Prophet(changepoint_prior_scale=0.01).fit(df_interpolate_fb)
future = fbm.make_future_dataframe(periods=12, freq='M')
fcst = fbm.predict(future)
fig = fbm.plot(fcst)
plt.title("Monthly Prediction \n 1 year time frame")

plt.show()
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
In [210]:
fig3 = fbm.plot_components(fcst)
plt.show()